From 5a789b760568b5afefbbb789c3c8215147f2f1dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Florian=20F=C3=B6rster?=
 <florian.foerster@mb.tu-chemnitz.de>
Date: Thu, 14 Nov 2024 16:40:00 +0100
Subject: [PATCH] added new test cases

---
 notebooks/misc.ipynb                          | 1917 ++++++++++++++++-
 notebooks/test.graphml                        |   37 +
 pyproject.toml                                |    2 +
 src/lang_main/analysis/graphs.py              |   57 +-
 src/lang_main/analysis/preprocessing.py       |   26 +-
 src/lang_main/analysis/tokens.py              |   77 +-
 src/lang_main/constants.py                    |   10 +-
 src/lang_main/io.py                           |    5 +-
 src/lang_main/pipelines/predefined.py         |    4 +-
 .../Dummy_Dataset_N_1000.csv                  |    0
 tests/_comparison_results/analyse_feature.pkl |  Bin 0 -> 25949 bytes
 .../analyse_feature.xlsx}                     |  Bin
 tests/_comparison_results/merge_cands.xlsx    |  Bin 0 -> 5989 bytes
 .../merge_similarity_candidates.pkl           |  Bin 0 -> 3512 bytes
 .../numeric_pre_filter.pkl                    |  Bin 0 -> 25819 bytes
 tests/_comparison_results/tk_graph_built.pkl  |  Bin 0 -> 1766 bytes
 tests/analysis/test_graphs.py                 |  148 +-
 tests/analysis/test_preprocessing.py          |   43 +
 tests/analysis/test_tokens.py                 |   79 +
 tests/conftest.py                             |   28 +-
 20 files changed, 2339 insertions(+), 94 deletions(-)
 create mode 100644 notebooks/test.graphml
 rename tests/{ => _comparison_results}/Dummy_Dataset_N_1000.csv (100%)
 create mode 100644 tests/_comparison_results/analyse_feature.pkl
 rename tests/{analyse_dataset.xlsx => _comparison_results/analyse_feature.xlsx} (100%)
 create mode 100644 tests/_comparison_results/merge_cands.xlsx
 create mode 100644 tests/_comparison_results/merge_similarity_candidates.pkl
 create mode 100644 tests/_comparison_results/numeric_pre_filter.pkl
 create mode 100644 tests/_comparison_results/tk_graph_built.pkl
 create mode 100644 tests/analysis/test_tokens.py

diff --git a/notebooks/misc.ipynb b/notebooks/misc.ipynb
index 5b47137..b354b50 100644
--- a/notebooks/misc.ipynb
+++ b/notebooks/misc.ipynb
@@ -21,17 +21,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
    "id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loaded TOML config file successfully.\n"
+     ]
+    }
+   ],
    "source": [
-    "import networkx as nx"
+    "import networkx as nx\n",
+    "from lang_main.analysis import graphs"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 17,
    "id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af",
    "metadata": {},
    "outputs": [],
@@ -56,7 +65,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 18,
    "id": "c4fd9997-1e41-49f1-b879-4b3a6571931d",
    "metadata": {},
    "outputs": [],
@@ -70,7 +79,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 19,
    "id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1",
    "metadata": {},
    "outputs": [
@@ -85,7 +94,7 @@
        " (2, 1, {'weight': 6})]"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -96,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 68,
    "id": "d017b2bc-9cd3-4124-afed-c6eabc07a540",
    "metadata": {},
    "outputs": [],
@@ -105,9 +114,582 @@
     "G.add_edges_from(edges_to_add)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "f8bbf276-3b07-41d6-ad74-778f09cbab96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "graphs.add_weighted_degree(G, 'weight', 'degree_weighted')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "id": "d7b6f917-23f6-44a4-bc8d-125f7658e4d5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "OutEdgeView([(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)])"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "G.edges"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "473e9e25-d417-4a0a-bff2-7765de516a89",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "0a48d11d-1f2b-475e-9ddf-bb9a3f67accb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "id": "e340377a-0df4-44ca-b18e-8b354e273eb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_pth = Path.cwd() / 'test.graphml'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "id": "66677ad0-a1e5-4772-a0ba-7fbeeda55297",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nx.write_graphml(G, save_pth)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "f01ebe25-56b9-410a-a2bf-d5a6e211de7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "G_load = nx.read_graphml(save_pth, node_type=int)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "10bfad35-1f96-41a1-9014-578313502e6c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "OutEdgeView([(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)])"
+      ]
+     },
+     "execution_count": 85,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "G_load.edges"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "66189241-637e-4765-b6f0-6ff090b6ba0a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1af4ba3-ced8-425f-a730-da14fd8aab8e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1efd5f4e-fd19-46fd-bb7e-b23bec724cdd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lang_main.pipelines.predefined import STFR_MODEL"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "50ee13e1-e10e-4efe-8706-6ca321f6cf9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sents = [\n",
+    "    'Kontrolle der Schmiernippel',\n",
+    "    'Kontrolle der Schmiersysteme',\n",
+    "]\n",
+    "'Kontrolle der Lichtschranken\n",
+    "Überprüfung der Spannrollen\n",
+    "Überprüfung der Druckventile\n",
+    "Kontrolle der Schmiernippel\n",
+    "Kontrolle der Schmiersysteme\n",
+    "Inspektion der Förderbänder\n",
+    "Reinigung der Luftfilter\n",
+    "Inspektion der Schutzabdeckungen\n",
+    "Überprüfung der Ölstände\n",
+    "'Überprüfung der Hydraulik'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "ca0b4089-d8cc-4566-a9ef-ed35b55d18b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embds = STFR_MODEL.encode(sents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "cff09ea6-04b9-4544-aee5-0a7e0bbda2d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[1.0000, 0.8907],\n",
+       "        [0.8907, 1.0000]])"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "STFR_MODEL.similarity(embds, embds)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "45dc7050-9b6e-4c62-ba87-a74fb7985933",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "384"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "STFR_MODEL.max_seq_length"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54bf2e2a-7ada-4e4d-9e2e-1d17631e7d06",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 8,
+   "id": "c5d970e6-7bfd-4da0-82da-56a12e12a86c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7dcf9e86-a7d3-436c-a705-cddb83e704bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = {\n",
+    "    'idx': [0,1,2,3,4],\n",
+    "    'data': ['test1', 'test2', 'test3', 'test4', 'test5']\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "0962d3af-e44d-4078-ac4f-dbd59e6a33eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.DataFrame.from_dict(data)\n",
+    "df2 = pd.DataFrame.from_dict(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "5743636e-0330-4c7b-879b-0aa8ff6bfa53",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "bool((df1 == df2).all(axis=None))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "d88b4e70-012e-4dfe-ad52-4210386ed8fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "4afe4713-20d5-4626-a942-e28c4eff8d0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\tests\\_comparison_results')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "da810b1b-b5cf-4c18-ad26-eff156ccfd54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p_load = p / 'merge_similarity_candidates.pkl'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "aa5774ff-5be3-4a7a-92dc-09331f12ee2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1 = pd.read_pickle(p_load)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "e4ec576e-ec39-4981-99e7-75fdc7ac0979",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df2 = pd.read_pickle(p_load)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "dc24c3a0-484b-4019-8f2c-4913e36d9b1b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df1_c = df1[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']]\n",
+    "df2_c = df2[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "83ade5ae-95f7-4f44-afb2-e1c5a2c5694c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>entry</th>\n",
+       "      <th>len</th>\n",
+       "      <th>num_occur</th>\n",
+       "      <th>num_assoc_obj_ids</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>41</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>29</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>61</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    entry   len  num_occur  num_assoc_obj_ids\n",
+       "41   True  True       True               True\n",
+       "22   True  True       True               True\n",
+       "13   True  True       True               True\n",
+       "6    True  True       True               True\n",
+       "29   True  True       True               True\n",
+       "10   True  True       True               True\n",
+       "17   True  True       True               True\n",
+       "61   True  True       True               True\n",
+       "5    True  True       True               True"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(df1_c == df2_c)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97d6dd4a-7f3d-4459-bf42-46d0bd087ccd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "35463772-bf3c-43b4-b536-cf4456b3f0f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dateutil import parser"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "fa9c87f8-a42c-447d-bbb3-9c9d6830bd04",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "8d6f97a6-dafa-439e-9d9e-c37515be81bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "5a1a15e7-f9bb-463f-9c83-a12ff0f8328e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dates = ['22.05.', '08.2024', '22.05.2024', 'hallo', '22.1250.25']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "442beb19-06ca-46ce-9d64-2e6c632ffb3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "string = '22.1250.25'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "21e4a7c2-76f4-43bd-aeed-34e52ed53db3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "match = pattern_dates.search(string)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "3f5d75f6-58dd-43a6-abf3-80f581807554",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('22.', None, '1250')"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "match.groups()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "id": "306bcd91-8b87-47fe-96d4-cbc2a2bbad88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dates_recog = []\n",
+    "for date in dates:\n",
+    "    match = pattern_dates.search(date)\n",
+    "    date_found = any(match.groups())\n",
+    "    dates_recog.append(date_found)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "id": "4e996e9b-8d75-4060-984e-ee439bfd5d45",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[True, True, True, False, True]"
+      ]
+     },
+     "execution_count": 84,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dates_recog"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
    "id": "91d4094b-f886-4056-a697-5223f157f1d3",
    "metadata": {},
    "outputs": [],
@@ -118,17 +700,1326 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "518cada9-561a-4b96-b750-3d500d1d28b9",
+   "execution_count": null,
+   "id": "0dabae5f-89b6-4457-a4ef-17cc33c6d561",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "8830bbd6-ce01-475b-b492-455400319a9d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Loaded TOML config file successfully.\n"
+     ]
+    }
+   ],
    "source": [
-    "from lang_main.analysis import graphs"
+    "from lang_main import model_loader\n",
+    "from lang_main.analysis import tokens, graphs\n",
+    "\n",
+    "from lang_main.types import SpacyModelTypes"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 2,
+   "id": "ee31987c-9763-4952-8d83-bf9265430e74",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\thinc\\shims\\pytorch.py:261: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
+      "  model.load_state_dict(torch.load(filelike, map_location=device))\n"
+     ]
+    }
+   ],
+   "source": [
+    "sentence = (\n",
+    "    'Ich ging am 22.05. mit ID 0912393 schnell über die Wiese zu einem Menschen, um ihm zu helfen. '\n",
+    "    'Ich konnte nicht mit ansehen, wie er Probleme beim Tragen '\n",
+    "    'seiner Tasche hatte.'\n",
+    ")\n",
+    "model = model_loader.load_spacy(\n",
+    "    model_name=SpacyModelTypes.DE_CORE_NEWS_SM,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e086ee66-95c3-4fbc-bd04-a16b0fcdb26a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "doc = model(sentence)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "120c886d-6f2d-48e1-a300-8f39d9771204",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from spacy import displacy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "30b5f152-be1f-43c6-8466-98de50a28443",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<span class=\"tex2jax_ignore\"><svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"de\" id=\"22b603d03c644eac83cadccad8539198-0\" class=\"displacy\" width=\"5475\" height=\"662.0\" direction=\"ltr\" style=\"max-width: none; height: 662.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Ich</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">ging</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">VERB</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">am</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">ADP</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\">22.05.</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">NUM</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">mit</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">ADP</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">ID</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">X</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1100\">0912393</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1100\">NUM</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1275\">schnell</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1275\">ADV</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1450\">über</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1450\">ADP</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1625\">die</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1625\">DET</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1800\">Wiese</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1800\">NOUN</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1975\">zu</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1975\">ADP</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2150\">einem</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2150\">DET</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2325\">Menschen,</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2325\">NOUN</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2500\">um</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2500\">SCONJ</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2675\">ihm</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2675\">PRON</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2850\">zu</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2850\">PART</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"3025\">helfen.</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"3025\">VERB</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"3200\">Ich</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"3200\">PRON</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"3375\">konnte</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"3375\">AUX</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"3550\">nicht</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"3550\">PART</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"3725\">mit</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"3725\">ADV</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"3900\">ansehen,</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"3900\">VERB</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"4075\">wie</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"4075\">SCONJ</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"4250\">er</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"4250\">PRON</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"4425\">Probleme</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"4425\">NOUN</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"4600\">beim</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"4600\">ADP</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"4775\">Tragen</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"4775\">NOUN</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"4950\">seiner</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"4950\">DET</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"5125\">Tasche</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"5125\">NOUN</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"572.0\">\n",
+       "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"5300\">hatte.</tspan>\n",
+       "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"5300\">VERB</tspan>\n",
+       "</text>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-0\" stroke-width=\"2px\" d=\"M70,527.0 C70,439.5 200.0,439.5 200.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">sb</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M70,529.0 L62,517.0 78,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-1\" stroke-width=\"2px\" d=\"M245,527.0 C245,439.5 375.0,439.5 375.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M375.0,529.0 L383.0,517.0 367.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-2\" stroke-width=\"2px\" d=\"M420,527.0 C420,439.5 550.0,439.5 550.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M550.0,529.0 L558.0,517.0 542.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-3\" stroke-width=\"2px\" d=\"M245,527.0 C245,352.0 730.0,352.0 730.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M730.0,529.0 L738.0,517.0 722.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-4\" stroke-width=\"2px\" d=\"M770,527.0 C770,439.5 900.0,439.5 900.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M900.0,529.0 L908.0,517.0 892.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-5\" stroke-width=\"2px\" d=\"M945,527.0 C945,439.5 1075.0,439.5 1075.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-5\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M1075.0,529.0 L1083.0,517.0 1067.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-6\" stroke-width=\"2px\" d=\"M245,527.0 C245,264.5 1260.0,264.5 1260.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-6\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M1260.0,529.0 L1268.0,517.0 1252.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-7\" stroke-width=\"2px\" d=\"M245,527.0 C245,177.0 1440.0,177.0 1440.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-7\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M1440.0,529.0 L1448.0,517.0 1432.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-8\" stroke-width=\"2px\" d=\"M1645,527.0 C1645,439.5 1775.0,439.5 1775.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-8\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M1645,529.0 L1637,517.0 1653,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-9\" stroke-width=\"2px\" d=\"M1470,527.0 C1470,352.0 1780.0,352.0 1780.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-9\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M1780.0,529.0 L1788.0,517.0 1772.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-10\" stroke-width=\"2px\" d=\"M245,527.0 C245,89.5 1970.0,89.5 1970.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-10\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M1970.0,529.0 L1978.0,517.0 1962.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-11\" stroke-width=\"2px\" d=\"M2170,527.0 C2170,439.5 2300.0,439.5 2300.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-11\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M2170,529.0 L2162,517.0 2178,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-12\" stroke-width=\"2px\" d=\"M1995,527.0 C1995,352.0 2305.0,352.0 2305.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-12\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M2305.0,529.0 L2313.0,517.0 2297.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-13\" stroke-width=\"2px\" d=\"M2520,527.0 C2520,264.5 3010.0,264.5 3010.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-13\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">cp</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M2520,529.0 L2512,517.0 2528,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-14\" stroke-width=\"2px\" d=\"M2695,527.0 C2695,352.0 3005.0,352.0 3005.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-14\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">da</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M2695,529.0 L2687,517.0 2703,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-15\" stroke-width=\"2px\" d=\"M2870,527.0 C2870,439.5 3000.0,439.5 3000.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-15\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">pm</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M2870,529.0 L2862,517.0 2878,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-16\" stroke-width=\"2px\" d=\"M245,527.0 C245,2.0 3025.0,2.0 3025.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-16\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M3025.0,529.0 L3033.0,517.0 3017.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-17\" stroke-width=\"2px\" d=\"M3220,527.0 C3220,439.5 3350.0,439.5 3350.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-17\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">sb</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M3220,529.0 L3212,517.0 3228,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-18\" stroke-width=\"2px\" d=\"M3395,527.0 C3395,439.5 3525.0,439.5 3525.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-18\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">ng</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M3525.0,529.0 L3533.0,517.0 3517.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-19\" stroke-width=\"2px\" d=\"M3745,527.0 C3745,439.5 3875.0,439.5 3875.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-19\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M3745,529.0 L3737,517.0 3753,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-20\" stroke-width=\"2px\" d=\"M3395,527.0 C3395,352.0 3880.0,352.0 3880.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-20\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">oc</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M3880.0,529.0 L3888.0,517.0 3872.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-21\" stroke-width=\"2px\" d=\"M4095,527.0 C4095,89.5 5295.0,89.5 5295.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-21\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mo</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M4095,529.0 L4087,517.0 4103,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-22\" stroke-width=\"2px\" d=\"M4270,527.0 C4270,177.0 5290.0,177.0 5290.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-22\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">sb</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M4270,529.0 L4262,517.0 4278,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-23\" stroke-width=\"2px\" d=\"M4445,527.0 C4445,264.5 5285.0,264.5 5285.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-23\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">oa</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M4445,529.0 L4437,517.0 4453,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-24\" stroke-width=\"2px\" d=\"M4445,527.0 C4445,439.5 4575.0,439.5 4575.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-24\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">mnr</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M4575.0,529.0 L4583.0,517.0 4567.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-25\" stroke-width=\"2px\" d=\"M4620,527.0 C4620,439.5 4750.0,439.5 4750.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-25\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M4750.0,529.0 L4758.0,517.0 4742.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-26\" stroke-width=\"2px\" d=\"M4970,527.0 C4970,439.5 5100.0,439.5 5100.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-26\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nk</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M4970,529.0 L4962,517.0 4978,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-27\" stroke-width=\"2px\" d=\"M4795,527.0 C4795,352.0 5105.0,352.0 5105.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-27\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">ag</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M5105.0,529.0 L5113.0,517.0 5097.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "\n",
+       "<g class=\"displacy-arrow\">\n",
+       "    <path class=\"displacy-arc\" id=\"arrow-22b603d03c644eac83cadccad8539198-0-28\" stroke-width=\"2px\" d=\"M3920,527.0 C3920,2.0 5300.0,2.0 5300.0,527.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
+       "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
+       "        <textPath xlink:href=\"#arrow-22b603d03c644eac83cadccad8539198-0-28\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">oc</textPath>\n",
+       "    </text>\n",
+       "    <path class=\"displacy-arrowhead\" d=\"M5300.0,529.0 L5308.0,517.0 5292.0,517.0\" fill=\"currentColor\"/>\n",
+       "</g>\n",
+       "</svg></span>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "displacy.render(doc, style=\"dep\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "944b2da6-2c2a-4a58-b0ad-b2f280b7fecb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sent = list(doc.sents)[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "ad8c1f0a-c46f-4b47-99d3-fa7e254ff570",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'konnte'"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word = sent[1]\n",
+    "word.text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "189207d4-d0e1-4b8a-be8d-f5328f37c9da",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Ich,\n",
+       " konnte,\n",
+       " nicht,\n",
+       " mit,\n",
+       " ansehen,\n",
+       " ,,\n",
+       " wie,\n",
+       " er,\n",
+       " Probleme,\n",
+       " beim,\n",
+       " Tragen,\n",
+       " seiner,\n",
+       " Tasche,\n",
+       " hatte,\n",
+       " .]"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(word.subtree)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "a83bdfab-4ada-482a-b0be-d093f115a6e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ich:\t\tPRON\n",
+      "konnte:\t\tAUX\n",
+      "nicht:\t\tPART\n",
+      "mit:\t\tADV\n",
+      "ansehen:\t\tVERB\n",
+      ",:\t\tPUNCT\n",
+      "wie:\t\tSCONJ\n",
+      "er:\t\tPRON\n",
+      "Probleme:\t\tNOUN\n",
+      "beim:\t\tADP\n",
+      "Tragen:\t\tNOUN\n",
+      "seiner:\t\tDET\n",
+      "Tasche:\t\tNOUN\n",
+      "hatte:\t\tVERB\n",
+      ".:\t\tPUNCT\n"
+     ]
+    }
+   ],
+   "source": [
+    "for token in word.subtree:\n",
+    "    print(f'{token}:\\t\\t{token.pos_}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "04194be3-7f30-4f02-a3ed-c2ca016652b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from lang_main.analysis import tokens"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "ea169167-f55e-4574-92bc-54aafc75ccc7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'ging'"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word.text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "da9c5a7b-162d-4b99-b59e-f97bb765d08c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rel_descs = tokens.obtain_relevant_descendants(word)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "2fefb0dc-8285-4f42-9323-23b0bc9d8cc0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(0912393, schnell, Wiese, Menschen)"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tuple(rel_descs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "397e088f-743b-4554-a695-65d0ddaac8ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tk_graph = graphs.TokenGraph()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "fd46701c-e428-43f8-80d9-979e96094bf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokens.add_doc_info_to_graph(tk_graph, doc, weight=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "fc860f52-4bdb-469f-be8b-901bea39224e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "NodeView(('gehen', '0912393', 'schnell', 'Wiese', 'Mensch', 'mit', 'Problem', 'Tragen', 'Tasche', 'ansehen', 'haben'))"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tk_graph.nodes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "56e49d63-7374-428f-a1b0-26e3d136ab9a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "OutEdgeView([('gehen', '0912393'), ('gehen', 'schnell'), ('gehen', 'Wiese'), ('gehen', 'Mensch'), ('mit', 'Problem'), ('mit', 'Tragen'), ('mit', 'Tasche'), ('Problem', 'Tragen'), ('Problem', 'Tasche'), ('Tragen', 'Tasche'), ('ansehen', 'mit'), ('ansehen', 'Problem'), ('ansehen', 'Tragen'), ('ansehen', 'Tasche'), ('haben', 'Problem'), ('haben', 'Tragen'), ('haben', 'Tasche')])"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tk_graph.edges"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "f0056e82-4ddc-4034-afc9-c25e3c2331b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['gehen',\n",
+       " '0912393',\n",
+       " 'schnell',\n",
+       " 'Wiese',\n",
+       " 'Mensch',\n",
+       " 'mit',\n",
+       " 'Problem',\n",
+       " 'Tragen',\n",
+       " 'Tasche',\n",
+       " 'ansehen',\n",
+       " 'haben']"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(tk_graph.nodes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "id": "ee506f29-a6d0-47b9-a980-0227fa1d2a59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tkg, undir = graphs.pipe_rescale_graph_edge_weights(tk)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "29a82ea9-6a66-47d3-bdbf-e41284785bc9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0952</td>\n",
+       "      <td>0.7487</td>\n",
+       "      <td>0.9830</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.8959</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.9538</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     1       2       3       4\n",
+       "1  0.0  0.0952  0.7487  0.9830\n",
+       "2  1.0  0.0000  0.0000  0.8959\n",
+       "3  0.0  0.0000  0.0000  0.9538\n",
+       "4  0.0  0.0000  0.0000  0.0000"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nx.to_pandas_adjacency(tkg)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "id": "de96d4db-0c98-4957-a91d-6d12e51fe2ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'weight': np.float32(1.0)}"
+      ]
+     },
+     "execution_count": 60,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "undir[2][1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "c802f550-5200-41f5-882e-a8eb780bacf3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>1.0000</td>\n",
+       "      <td>0.0952</td>\n",
+       "      <td>0.9412</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.6864</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0.0952</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.0000</td>\n",
+       "      <td>0.8661</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0.9412</td>\n",
+       "      <td>0.6864</td>\n",
+       "      <td>0.8661</td>\n",
+       "      <td>0.0000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        1       2       3       4\n",
+       "1  0.0000  1.0000  0.0952  0.9412\n",
+       "2  1.0000  0.0000  0.0000  0.6864\n",
+       "3  0.0952  0.0000  0.0000  0.8661\n",
+       "4  0.9412  0.6864  0.8661  0.0000"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nx.to_pandas_adjacency(undir)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84c78e9a-8b34-465c-9bc4-13b38fa0cc32",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "58fe4954-ce69-4442-b6fa-504f1466b1dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tk.has_edge(1,2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "baad206f-94ab-495a-8cc2-87a873220401",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>7.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     1    2    3    4\n",
+       "1  0.0  7.0  2.0  5.0\n",
+       "2  7.0  0.0  0.0  3.0\n",
+       "3  2.0  0.0  0.0  4.0\n",
+       "4  5.0  3.0  4.0  0.0"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nx.to_pandas_adjacency(tk.undirected)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "996a303e-02db-496a-bd23-29c92d13d260",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n",
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tk.undirected.has_edge(1,2))\n",
+    "print(tk.undirected.has_edge(2,1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "5dbe02a1-3883-44d7-836b-dd2c4d27f5f8",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "TypeError",
+     "evalue": "Graph.to_undirected() got an unexpected keyword argument 'inplace'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[52], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m filt \u001b[38;5;241m=\u001b[39m \u001b[43mgraphs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_graph_by_edge_weight\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtk\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mundirected\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[1;32mA:\\Arbeitsaufgaben\\lang-main\\src\\lang_main\\analysis\\graphs.py:230\u001b[0m, in \u001b[0;36mfilter_graph_by_edge_weight\u001b[1;34m(graph, bound_lower, bound_upper)\u001b[0m\n\u001b[0;32m    228\u001b[0m         filtered_graph\u001b[38;5;241m.\u001b[39mremove_edge(edge[\u001b[38;5;241m0\u001b[39m], edge[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m    229\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m bound_upper \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m weight \u001b[38;5;241m>\u001b[39m bound_upper:\n\u001b[1;32m--> 230\u001b[0m         filtered_graph\u001b[38;5;241m.\u001b[39mremove_edge(edge[\u001b[38;5;241m0\u001b[39m], edge[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m    232\u001b[0m filtered_graph\u001b[38;5;241m.\u001b[39mto_undirected(inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, logging\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m    233\u001b[0m filtered_graph\u001b[38;5;241m.\u001b[39mupdate_metadata(logging\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
+      "\u001b[1;31mTypeError\u001b[0m: Graph.to_undirected() got an unexpected keyword argument 'inplace'"
+     ]
+    }
+   ],
+   "source": [
+    "filt = graphs.filter_graph_by_edge_weight(tk.undirected, 2, 6)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "3d9ecb28-23ef-48ac-9cee-86ace6be7af1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>5.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>3.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>4.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>5.0</td>\n",
+       "      <td>3.0</td>\n",
+       "      <td>4.0</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     1    2    3    4\n",
+       "1  0.0  0.0  2.0  5.0\n",
+       "2  0.0  0.0  0.0  3.0\n",
+       "3  2.0  0.0  0.0  4.0\n",
+       "4  5.0  3.0  4.0  0.0"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nx.to_pandas_adjacency(filt.undirected)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "id": "42345f27-585f-4498-a4cc-50d17c9f9b69",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "too many values to unpack (expected 2)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[54], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mfilt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43medges\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mweight\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n",
+      "File \u001b[1;32mA:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\networkx\\classes\\reportviews.py:1095\u001b[0m, in \u001b[0;36mOutEdgeView.__getitem__\u001b[1;34m(self, e)\u001b[0m\n\u001b[0;32m   1090\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, \u001b[38;5;28mslice\u001b[39m):\n\u001b[0;32m   1091\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m nx\u001b[38;5;241m.\u001b[39mNetworkXError(\n\u001b[0;32m   1092\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not support slicing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1093\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtry list(G.edges)[\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mstart\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mstop\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mstep\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m   1094\u001b[0m     )\n\u001b[1;32m-> 1095\u001b[0m u, v \u001b[38;5;241m=\u001b[39m e\n\u001b[0;32m   1096\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m   1097\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_adjdict[u][v]\n",
+      "\u001b[1;31mValueError\u001b[0m: too many values to unpack (expected 2)"
+     ]
+    }
+   ],
+   "source": [
+    "filt.edges"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17e7c931-d94e-43cf-ac97-bb6fccc1ee70",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "7dfa028e-d2e7-4390-bd36-b08b0a591b22",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filt.has_edge(1,2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "1b6e6938-1546-490a-9b64-e3d2f60d188d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filt.has_edge(2,1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "518cada9-561a-4b96-b750-3d500d1d28b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)]"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(tk.edges)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "9830c614-5c16-41fd-8987-be3d421da34a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'degree_weighted': 14}"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tk.nodes[1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "42b2bb65-534f-4c9c-b439-d5eec4b285e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'degree_weighted': 10}"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tk.undirected.nodes[2]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c937e70b-bd89-4c3b-aa09-5f0a63982c13",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
    "id": "3235f188-6e99-4855-aa3d-b0e04e3db319",
    "metadata": {},
    "outputs": [
@@ -144,7 +2035,7 @@
        " 'total_memory': 448}"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/notebooks/test.graphml b/notebooks/test.graphml
new file mode 100644
index 0000000..58011f8
--- /dev/null
+++ b/notebooks/test.graphml
@@ -0,0 +1,37 @@
+<?xml version='1.0' encoding='utf-8'?>
+<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
+  <key id="d1" for="edge" attr.name="weight" attr.type="long" />
+  <key id="d0" for="node" attr.name="degree_weighted" attr.type="long" />
+  <graph edgedefault="directed">
+    <node id="1">
+      <data key="d0">14</data>
+    </node>
+    <node id="2">
+      <data key="d0">10</data>
+    </node>
+    <node id="3">
+      <data key="d0">6</data>
+    </node>
+    <node id="4">
+      <data key="d0">12</data>
+    </node>
+    <edge source="1" target="2">
+      <data key="d1">1</data>
+    </edge>
+    <edge source="1" target="3">
+      <data key="d1">2</data>
+    </edge>
+    <edge source="1" target="4">
+      <data key="d1">5</data>
+    </edge>
+    <edge source="2" target="4">
+      <data key="d1">3</data>
+    </edge>
+    <edge source="2" target="1">
+      <data key="d1">6</data>
+    </edge>
+    <edge source="3" target="4">
+      <data key="d1">4</data>
+    </edge>
+  </graph>
+</graphml>
diff --git a/pyproject.toml b/pyproject.toml
index 9df6b04..f40a111 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -118,6 +118,8 @@ exclude_also = [
     "def __repr__",
     "def __str__",
     "@overload",
+    "if logging",
+    "if TYPE_CHECKING",
 ]
 
 [tool.coverage.html]
diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py
index ec0af65..3ebce80 100644
--- a/src/lang_main/analysis/graphs.py
+++ b/src/lang_main/analysis/graphs.py
@@ -198,8 +198,10 @@ def filter_graph_by_edge_weight(
     graph: TokenGraph,
     bound_lower: int | None,
     bound_upper: int | None,
+    property: str = 'weight',
 ) -> TokenGraph:
     """filters all edges which are within the provided bounds
+    inclusive limits: bound_lower <= edge_weight <= bound_upper are retained
 
     Parameters
     ----------
@@ -216,12 +218,12 @@ def filter_graph_by_edge_weight(
     original_graph_edges = copy.deepcopy(graph.edges)
     filtered_graph = graph.copy()
 
-    if not any([bound_lower, bound_upper]):
+    if not any((bound_lower, bound_upper)):
         logger.warning('No bounds provided, returning original graph.')
         return filtered_graph
 
     for edge in original_graph_edges:
-        weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
+        weight = typing.cast(int, filtered_graph[edge[0]][edge[1]][property])
         if bound_lower is not None and weight < bound_lower:
             filtered_graph.remove_edge(edge[0], edge[1])
         if bound_upper is not None and weight > bound_upper:
@@ -329,14 +331,12 @@ def static_graph_analysis(
     Parameters
     ----------
     tk_graph_directed : TokenGraph
-        token graph (directed) and  with rescaled edge weights
-    tk_graph_undirected : Graph
-        token graph (undirected) and  with rescaled edge weights
+        token graph (directed)
 
     Returns
     -------
-    tuple[TokenGraph, Graph]
-        token graph (directed) and undirected version with added weighted degree
+    tuple[TokenGraph]
+        token graph (directed) with included undirected version and calculated KPIs
     """
     graph = graph.copy()
     graph.perform_static_analysis()
@@ -559,12 +559,12 @@ class TokenGraph(DiGraph):
         return hash(self.__key())
     """
 
-    def copy(self) -> TokenGraph:
+    def copy(self) -> Self:
         """returns a (deep) copy of the graph
 
         Returns
         -------
-        TokenGraph
+        Self
             deep copy of the graph
         """
         return copy.deepcopy(self)
@@ -669,7 +669,7 @@ class TokenGraph(DiGraph):
 
         return token_graph, undirected
 
-    def perform_static_analysis(self):
+    def perform_static_analysis(self) -> None:
         """calculate different metrics directly on the data of the underlying graphs
         (directed and undirected)
 
@@ -717,16 +717,11 @@ class TokenGraph(DiGraph):
         saving_path = self._save_prepare(path=path, filename=filename)
 
         if directed:
-            target_graph = self._directed
-        elif not directed and self._undirected is not None:
-            target_graph = self._undirected
+            target_graph = self.directed
         else:
-            raise ValueError('No undirected graph available.')
+            target_graph = self.undirected
 
         save_to_GraphML(graph=target_graph, saving_path=saving_path)
-        # saving_path = saving_path.with_suffix('.graphml')
-        # nx.write_graphml(G=target_graph, path=saving_path)
-        # logger.info('Successfully saved graph as GraphML file under %s.', saving_path)
 
     def to_pickle(
         self,
@@ -743,13 +738,14 @@ class TokenGraph(DiGraph):
             filename to be given, by default None
         """
         saving_path = self._save_prepare(path=path, filename=filename)
-        saving_path = saving_path.with_suffix('.pickle')
+        saving_path = saving_path.with_suffix('.pkl')
         save_pickle(obj=self, path=saving_path)
 
     @classmethod
     def from_file(
         cls,
         path: Path,
+        node_type_graphml: type = str,
     ) -> Self:
         # !! no validity checks for pickle files
         # !! GraphML files not correct because not all properties
@@ -757,7 +753,7 @@ class TokenGraph(DiGraph):
         # TODO REWORK
         match path.suffix:
             case '.graphml':
-                graph = typing.cast(Self, nx.read_graphml(path, node_type=int))
+                graph = typing.cast(Self, nx.read_graphml(path, node_type=node_type_graphml))
                 logger.info('Successfully loaded graph from GraphML file %s.', path)
             case '.pkl' | '.pickle':
                 graph = typing.cast(Self, load_pickle(path))
@@ -767,17 +763,18 @@ class TokenGraph(DiGraph):
 
         return graph
 
-    @classmethod
-    def from_pickle(
-        cls,
-        path: str | Path,
-    ) -> Self:
-        if isinstance(path, str):
-            path = Path(path)
+    # TODO check removal
+    # @classmethod
+    # def from_pickle(
+    #     cls,
+    #     path: str | Path,
+    # ) -> Self:
+    #     if isinstance(path, str):
+    #         path = Path(path)
 
-        if path.suffix not in ('.pkl', '.pickle'):
-            raise ValueError('File format not supported.')
+    #     if path.suffix not in ('.pkl', '.pickle'):
+    #         raise ValueError('File format not supported.')
 
-        graph = typing.cast(Self, load_pickle(path))
+    #     graph = typing.cast(Self, load_pickle(path))
 
-        return graph
+    #     return graph
diff --git a/src/lang_main/analysis/preprocessing.py b/src/lang_main/analysis/preprocessing.py
index 69dac81..dcebabd 100644
--- a/src/lang_main/analysis/preprocessing.py
+++ b/src/lang_main/analysis/preprocessing.py
@@ -205,6 +205,30 @@ def numeric_pre_filter_feature(
     bound_lower: int | None,
     bound_upper: int | None,
 ) -> tuple[DataFrame]:
+    """filter a DataFrame on a given numerical feature using lower and upper bounds
+    bounds are inclusive: entries (bound_lower <= entry <= bound_upper) are retained
+
+    Parameters
+    ----------
+    data : DataFrame
+        DataFrame to filter
+    feature : str
+        feature name to filter
+    bound_lower : int | None
+        lower bound of values to retain
+    bound_upper : int | None
+        upper bound of values to retain
+
+    Returns
+    -------
+    tuple[DataFrame]
+        filtered DataFrame
+
+    Raises
+    ------
+    ValueError
+        if no bounds are provided, at least one bound must be set
+    """
     if not any([bound_lower, bound_upper]):
         raise ValueError('No bounds for filtering provided')
 
@@ -228,7 +252,7 @@ def numeric_pre_filter_feature(
 # a more robust identification of duplicates negating negative side effects
 # of several disturbances like typos, escape characters, etc.
 # build mapping of embeddings for given model
-def merge_similarity_dupl(
+def merge_similarity_duplicates(
     data: DataFrame,
     model: SentenceTransformer,
     cos_sim_threshold: float,
diff --git a/src/lang_main/analysis/tokens.py b/src/lang_main/analysis/tokens.py
index 6b35b45..f9009e7 100644
--- a/src/lang_main/analysis/tokens.py
+++ b/src/lang_main/analysis/tokens.py
@@ -11,6 +11,7 @@ from lang_main.analysis.graphs import (
     TokenGraph,
     update_graph,
 )
+from lang_main.analysis.shared import pattern_dates
 from lang_main.constants import (
     POS_INDIRECT,
     POS_OF_INTEREST,
@@ -38,21 +39,40 @@ def is_str_date(
     string: str,
     fuzzy: bool = False,
 ) -> bool:
+    """heuristic check whether a string represents a date, not 100 percent reliable
+
+    Parameters
+    ----------
+    string : str
+        string to check for dates
+    fuzzy : bool, optional
+        whether to use dateutil.parser.parse fuzzy capability, by default False
+
+    Returns
+    -------
+    bool
+        indicates whether date was found or not
+    """
     try:
         # check if string is a number
         # if length is greater than 8, it is not a date
         int(string)
-        if len(string) > 8:
+        if len(string) not in {2, 4}:
             return False
     except ValueError:
         # not a number
         pass
 
     try:
-        parse(string, fuzzy=fuzzy)
+        parse(string, fuzzy=fuzzy, dayfirst=True, yearfirst=False)
         return True
     except ValueError:
-        return False
+        date_found: bool = False
+        match = pattern_dates.search(string)
+        if match is None:
+            return date_found
+        date_found = any(match.groups())
+        return date_found
 
 
 def obtain_relevant_descendants(
@@ -106,7 +126,7 @@ def add_doc_info_to_graph(
             if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST):
                 continue
             # skip token which are dates or times
-            if is_str_date(string=token.text):
+            if token.pos_ == 'NUM' and is_str_date(string=token.text):
                 continue
 
             relevant_descendants = obtain_relevant_descendants(token=token)
@@ -252,32 +272,33 @@ def build_token_graph_simple(
     return graph, docs_mapping
 
 
-def build_token_graph_old(
-    data: DataFrame,
-    model: SpacyModel,
-) -> tuple[TokenGraph]:
-    # empty NetworkX directed graph
-    # graph = nx.DiGraph()
-    graph = TokenGraph()
+# TODO check removal
+# def build_token_graph_old(
+#     data: DataFrame,
+#     model: SpacyModel,
+# ) -> tuple[TokenGraph]:
+#     # empty NetworkX directed graph
+#     # graph = nx.DiGraph()
+#     graph = TokenGraph()
 
-    for row in tqdm(data.itertuples(), total=len(data)):
-        # obtain properties from tuple
-        # attribute names must match with preprocessed data
-        entry_text = cast(str, row.entry)
-        weight = cast(int, row.num_occur)
+#     for row in tqdm(data.itertuples(), total=len(data)):
+#         # obtain properties from tuple
+#         # attribute names must match with preprocessed data
+#         entry_text = cast(str, row.entry)
+#         weight = cast(int, row.num_occur)
 
-        # get spacy model output
-        doc = model(entry_text)
+#         # get spacy model output
+#         doc = model(entry_text)
 
-        add_doc_info_to_graph(
-            graph=graph,
-            doc=doc,
-            weight=weight,
-        )
+#         add_doc_info_to_graph(
+#             graph=graph,
+#             doc=doc,
+#             weight=weight,
+#         )
 
-    # metadata
-    graph.update_metadata()
-    # convert to undirected
-    graph.to_undirected()
+#     # metadata
+#     graph.update_metadata()
+#     # convert to undirected
+#     graph.to_undirected()
 
-    return (graph,)
+#     return (graph,)
diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py
index 4f27ccf..7b7f50c 100644
--- a/src/lang_main/constants.py
+++ b/src/lang_main/constants.py
@@ -43,6 +43,9 @@ LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file']
 LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr']
 LOGGING_DEFAULT_GRAPHS: Final[bool] = False
 
+# ** pickling
+PICKLE_PROTOCOL_VERSION: Final[int] = 5
+
 # ** paths
 input_path_conf = Path.cwd() / Path(CONFIG['paths']['inputs'])
 INPUT_PATH_FOLDER: Final[Path] = input_path_conf.resolve()
@@ -91,12 +94,7 @@ else:
 STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args
 # ** language dependency analysis
 # ** POS
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
-POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
-# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
+POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV', 'NUM'])
 POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
 # ** TAG
 # TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
diff --git a/src/lang_main/io.py b/src/lang_main/io.py
index f5b3af4..9b985eb 100644
--- a/src/lang_main/io.py
+++ b/src/lang_main/io.py
@@ -4,6 +4,7 @@ import shutil
 from pathlib import Path
 from typing import Any
 
+from lang_main.constants import PICKLE_PROTOCOL_VERSION
 from lang_main.loggers import logger_shared_helpers as logger
 
 
@@ -39,7 +40,7 @@ def save_pickle(
     path: str | Path,
 ) -> None:
     with open(path, 'wb') as file:
-        pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL)
+        pickle.dump(obj, file, protocol=PICKLE_PROTOCOL_VERSION)
     logger.info('Saved file successfully under %s', path)
 
 
@@ -56,7 +57,7 @@ def encode_to_base64_str(
     obj: Any,
     encoding: str = 'utf-8',
 ) -> str:
-    serialised = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+    serialised = pickle.dumps(obj, protocol=PICKLE_PROTOCOL_VERSION)
     b64_bytes = base64.b64encode(serialised)
     return b64_bytes.decode(encoding=encoding)
 
diff --git a/src/lang_main/pipelines/predefined.py b/src/lang_main/pipelines/predefined.py
index 0e073a0..22bb04d 100644
--- a/src/lang_main/pipelines/predefined.py
+++ b/src/lang_main/pipelines/predefined.py
@@ -5,7 +5,7 @@ from lang_main.analysis import graphs
 from lang_main.analysis.preprocessing import (
     analyse_feature,
     load_raw_data,
-    merge_similarity_dupl,
+    merge_similarity_duplicates,
     numeric_pre_filter_feature,
     remove_duplicates,
     remove_NA,
@@ -100,7 +100,7 @@ def build_merge_duplicates_pipe() -> Pipeline:
         },
     )
     pipe_merge.add(
-        merge_similarity_dupl,
+        merge_similarity_duplicates,
         {
             'model': STFR_MODEL,
             'cos_sim_threshold': THRESHOLD_SIMILARITY,
diff --git a/tests/Dummy_Dataset_N_1000.csv b/tests/_comparison_results/Dummy_Dataset_N_1000.csv
similarity index 100%
rename from tests/Dummy_Dataset_N_1000.csv
rename to tests/_comparison_results/Dummy_Dataset_N_1000.csv
diff --git a/tests/_comparison_results/analyse_feature.pkl b/tests/_comparison_results/analyse_feature.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..359919b0e234c34651e35212a15bda81f09ee170
GIT binary patch
literal 25949
zcmeI52bdGp`iGY;DrM;l>k?29kgft&WGSLUlT}nK>tuJb37fKJ63W`JfT*$DYwQIJ
z_TC%TwfEk8xxM%9{m#sLlF24<K)v_*Klcy&z`K+AzVH0bIo~OhOor86&Rc9%IDayg
zEoQvYw8u9jlGgZ8(u`TzO!rx4%ADi;er|U3iP___qcWZAV@=6yrp9X?iKnb&+>F}e
zV`kiJvXXW-vu$lO(a=2KJ<gt+%?!Y!J{k$y<8v<AOt1QMyd~1m9JT6$%EvK|r(-Rv
z#@9P)V(DlqVkVR3DhXSkv>Fm|JC#f~q?BbB$1<DgCh8rNbF)Upn(WLD!R)Eo#o19t
z2cxTT#@cMSSEfs2YE_G+oH{O?mOX8qd%jbq!?D=~3l{AEwfu4ZF{Y=o!PAbA%J%ij
zkl(Fjq#>KxraqKR#DeKi$Vy7IQRjc^wqeE2unq>{a<~@$14cWzzpRE&!k@rD!tNbi
zy8-Y}crrW%-UT0pU%>4<x&G7P9JmlB;6?B<colpXX2XLr9U}!zw`aSitB|HMs@G&Q
z9cE@z(*EILqt1Vo>JElYa1}fsUI0IXgQ@=Ra4&cbd;@+6KZn1Eld1mE@Obznd;$I(
z{tkAdy0vf?yaj#+v*8iO;5FIsXkF1~5#9q%g$Kbl_<J}Gp%1`L2pA7*;ps37FN6=l
z$KhM>_pl#AYvBH{37!nsz^CAwP*t$1R0X?IfdOzIn1;8(C*fEsI36AZkAP>uv*E4q
zX*ie)O@Z^_-SA!bXE=ZgOoGecweVh8RKdp(JQ7ZUhrw&%oA5pO0sIL50`5QsA}|Rv
zFbg{)yf+*NC&5MVGx$$9m5N*mKlfB@khkY<*Mi^^@Ok(ed>?LyusZl6+?9g&f&0Qn
z7>3ut`{0k^ui@8lG=h(Xr^6dO6&c{I$n^-^8XgDVg`dOT2(E%7;NI|17=n+(S78+b
z7s4g51^xp53H}|9MEGn^1^Sn&KwpF|fLFk);f?Ss_*?ilxD`TAgr~yGU>5}K0QZM0
z;3aS{Lep>~d<UvAQnCW|2w4Zug7Xn@F1!~$1fPH}!4F^sf)>Lg;2AIjpM!71N`x+j
zhrvd_3Uonm4V(;@z!mT!s8jFf9RC{jr2-eg+u*Zs4nnVnRS265>*0CuHuyL!>VYdL
znD`abIo<$og<rr<81{gp;4Ux$SHNGup$JaHR`?P81kOX~Y4CNA@P6LhzXCy9!QQYD
zo<PA@z~92JVGRWy43qE#xE{U;--lnqu?QOn_kv5{{ZO@dN#WZdbQatf-T{XpbPQ~T
z$HLp;`>-3r`oIW$6^=sCp)d$r;1A*5u&i+6Ra7FBZGXk-9N!DyhJS!V5PC7Z8eR|g
zBk-m0VfZ{8gs_v~neaOJ2rMgnCPEH?tKlc`NAM@`UvK~dYhVzj;Sb<2gdPCbz;*CS
zeIB6~!YBNMk3q<6Xux?e122V_L%my`iNG`AdGJ#B9Q+veN9Y822)qm)kKpUe3BM2l
zLtr)B9sU6R5*|Rohr<-y0N;Z@gFO*)8hil$3|1rTEciBj&m*|6cQ(<2ry(c|Tj0s?
zRQOl8BZ7Vn{{}xq$Uoq=6g(cD2+xLB!&~9U@KdOAx}@Me5V8oq1AhmnA!s%<;PLQg
z_-FV}xDCRm!x`{!_ybJ@-U*-a6FdW9F?bGq3U)z&4Nrm(!;usmf}JVQgbi>tTnjIU
z55TA4SFosswXm@Wn*b-l5Ihau1-C)yXm}2siC_z^g!jYY2ssFz0Uv^2Kqa(~H+Qwr
z(FiyXz6`fTz<zKUTnSHx&%gr_bUwTow!r{GX27}daJU|p6}l_J_J?}#J;w2$;Y0-O
z1Mh>sfxQq^2iL-1!C?s53GM<H!53kX&~*rCfG5B;@D_L^Lc;JKI2r+S;WD@i-VXaC
zU|+Zx-V8s4N?@h8%eBBh2uZ>eyb0DKXfE6zTJQ!q4MFF^XJAJJ^n_#Je0TyZD{wqQ
z4u?m=+u`HzQ}|EVgs@-2f5E*G7K87=qYx5?$H6Dd30zG<!{GyPHwsz=4}s0_6!<p$
z6+8$bx4@0?P53$Nf}mYJ!g_nVb|pgk!-4Ql_&WRz{2F#c$WCxE>`K9-;BoMmuopsX
zkDy*&K`j(;H+&AZQNXA0Oak5nUx&YiDGIn0Zh*JLHuw@$In~oEM6by^BA^-m4E`NH
zf}p41b_kdSZ-qlC$bcbufk!|OuYkD}Fb-Y@uce?31pW~mL7=1IgYXqk0o}a?=-tO|
z1Wv%|6mS=uMu2nSm!5##N(FqFKpj!m7tVw4!Joi?LDdH(1MWh=Iq(hm7uX%8*AnPC
zKZQ$i{1fa<fC=#Da3BG9_EWes4p+hy4zIx-P_#3=q@2Qux=L^m0r!MIgu4^?WskD1
z-p<g0?jcYw9B<KqaJ&uf=228K&=CYW9=<@JH(>_?4f0d87Xh;HIyjO*`=R6%KShIZ
zd<70Azz`Jd1RwSbG>QNR!$aVn1ULrX2vwq%Y$H8pKP13UQPhm%PvJ~IMMvRyK3t09
zBk&b?U%5cKO3$F=5(51gR^T|$6QGMXe{_HY2{4utzk;ne-sl(LX#!N^xHk@u!uS0G
zEF{1W;YCzo9~{r|3-Az*2jch<+z~|w`UQA|0BdkGaBP6fp~{PrjW`g;J>W)oB#w9Z
z1z3-Q3LM{pM<Bh~uL_ko%z#VbjyNX$0&I`NWVkP!2mj_*g{c@`2tR>m<G7Wdf`vHz
z2`<3#Q@FKXfL=6WH325#V8NID0(2(8j|i|G4ijL^FTgAu-+^}`{a)DBFThX~1PJgF
zjz{A7s$YOA0vru*fFBU3Q@H>E46lT*pr8kiFL)d~dnXd@xF3#}<9H5^Q9sACar_64
zaU7q5YSfi1@e&-5#_>jYmtTqVaqNJ@9+Y&LUy1#2T!7;)IBerrVjmo~!eKT1gI|gF
zQQ}z`J_dXEm3Tjn+v0E(yvEP*$2j&x@`E_8@^kEl<5f8R4*nIE?Z;DayaU6FDDio}
z61T%~Esi(A6@HHWFkB9gf%n6*Qd=qUbPO|;bdO(&FX1=|$2;Nfevb7RHoy~UoMZeP
zM^K`P<8N@R^K*>g_#(U<$E=^@u{eyu;aIqbpQDN6#qe492c(znj-xO<1~%ewx1Xcl
zJ>QJP<w$&^oK(FR{gRS?1t0X3)X6(7YKOBh_!8daDWRmpt{A=sFZOfz48td=%LjfA
zdtrD1ya%dgE-5L1!3}VoUkQJtbsj@n+|QvuhS$L%ehwWmd>ToQ`8jmO;b3@(pTk?!
zVGa&oz_P8h5{Ed3z5Pmf0f#7cc-GHh84i^=yyoY?`HH=9d=!?g!?qX%U|+uy`e1NA
zyu#067?FPp@9=Zj9>Y~M>#2SYJK^vs4$s4~m9E9%8aRUHE$grt!zf(oC+ScOKZe`;
zmGCl}Pr~q2KZiJx*W>uCpM#CV6dVroYo!x$_&bKf{Yp3khjlm{?AJ<X;xLuSSNWB2
zE)FJ!K|hC^ahQnV6hBE1;_w&h@JTs`-WXm_<Ue|(bo8zZ+VEox-tt5&Y4{fmJNOya
zW7yr#upfpW`5A7FVG~pXqNK)mX&&3pZ~;X|{E9w}h<o}O9*5y|enq!nINQ%KhT(30
zhD$Jf!_RPQ4FBq9xRETn!q4y|46lM^d!{D_!;w&AxD^Jkdki{w6IL61<!6xah%Oo7
zJU@dyJ+&zrf%6rr7CD1VwULOYl8I>43N%{Dz`RI9IAu44lV-fxif1$1wOtytk}b)$
zjiGeB$uX&uTfXCthn%k|K0ixt;%;7U#Z!@}+-*LjFn(P_I2MuH$|Ef;Ry5nbbk5qV
z?3AT$Ro`yExZPqkry_~CQ}mp+dy}Fv*tQ`q$YusDwj%LJlU6n_9ZH2FQMqkBn+ddY
zmC)%_s~K#x8k)sk)vdZ%d)xU@JLN<d<;ADxu4+u0>1d=mn;B7jR*F~=i7PdBW7~#U
zi@K3KQ+=RiryTK%t*9BX1L=5UKrGufrdp+j!;8Z#id*TJnTq7waED?eqith28jGY-
zRy5aICHi8C(+y@yZvU4GZ(kheKr7Z_CCyYi8B!6I;(qi(u2t2&|0SJs&C$9l8gYAM
zaA9}PQ@+7;L)cd3R~e>!O&}@#uVdF)aog2am20O3iPX3mX?vW=Fx|eaDGa(qzKYNw
zS|e@iQ<jrtLksi3X&g5Lb=BLY`QnJ=m*kdYvPh9`$dztICEt<}DZq*cq@<u!Oiap*
z1*fE>N@l6vnBwNGli_3~!&W3^FHa=pdj{bY!G;&sNh-TSRcLuZiTOsUH6?9Z#?_kA
z<I;dqTA;YJ88VV1R?u#?#7azRR7<52EzZ!5!~(hga#JRo89K)bN9AmOgyl?H<f|Hz
zpF1r|CK}Vud0T~Q2{fjY4Pi;>khGID(R|CcSIC!4oYbi@q>>3+4Odg0N|{mVLFpn9
zGa(nc(VZZY(X<*FPB<A9s%<P!i;H~8f{;XTMvr8Ld_5!)lLnHEQG-lE&Nro>B5~<S
znQUa_iVgxcS;1sN^`RA5YKG&<whhjNF;!LAl2i@MN?PGoGaOA?O;%K*wXc^xACvm2
z^xrWjKPoebq;;V~KxPh_Y|ll}wtG{pQK#zzp7t|!b(|{8z=%lbxMc_Qv?lVinHfvW
z=5$)s(`g?yU3g-6`#|C~S*n3lEnP1pRMtuphXXP>%9MVvB^J`x&R2IlW6zx%WTs6!
z<1w4rd7c%qResvKaJO?*ldCLJ-EAbzkaR>M6w(uIZLVS5tO}Uvw)IkMt>q*m&7@*T
zqp6n9+vkh!;|f+Q(9(2M$Z|Ssa#6ikNN2?Z&i9F=A~F&rU(`C|sl=Evl^|gQOWQWa
z8=I|UtIEmUSgYKomJw3ttUPG;wr@P37MJEktR*Q4k&uKKlyBpL9+YoM4PQd%sruxq
z>!iKRT22#7e>meeYC4BbiqAFEEh*bc8z;SMoMcUgrNym4exlYHo6QWXwIu%~wew4$
zVFs;e#7;!rj+SEPk**PU3C+n7C91EpnTbWs+-``^Y1_!)l5}<IKD!{roklc+&QLYe
zp(dH#8=c&<15H*-)J(Oi%*hojlB1?t5+viCHV8*jt@#lsUpA8wsS?Gr#<fP0&1s1w
zGfOtJOOfE>yqG5&7AKY4?hS?9Iezdgcg}JrPSx<9wjS%PMEgY0xigFTTUynPrDnOp
zKF(Q;lnJ5qOtEh|7K=F3ozwhN4^pd(nYP@iRHb#v{8JNBEFr5_s&p^pCrjIHId?eP
zG9C-M#Qlz*bckC8>4nkzx<geZli4U|-2Tr`p|(4Rnld{Llhk$wveerSc$PQit5$1?
zH`Vj9%yE(C1QGR0V@DHGhd}W{KU=S0daC?}*{^M*Y)n-zS}MC{JIi(=mXNKE+E%8M
ztt!vbwjEHt>4=jT)V5X@mZD{^&Y6l-uF8}tHOP;)mSiFrmCb^#f?7YmVMtD{^Q=TN
z`e($WMQcSoP-i8VC&p1(nTA~T&ZZ+!my%7cWJazZg1JF1nNc*pryLY-W(Uud=vCld
zs{^%3D`v}@ptd8jc$OG@`I$bD8$hZVWq?YnFo1N@yCWvvYDJplWcw!9JxP!3QzX-~
zb{B0cN)FK}^l?1Wtkc6y)WS?~E>EQW^1G7J`JthfrKly7kD41)X3R(ir8R6xB};~G
zHdEVf9>~vH-j#s~!855fsy&S!nAK{K=zW)(ALck0C8m=L(o}jSmj28&bAF!ctJgw1
zFxQgV+)Tz~Q0->O+7p$HU9RpD*G*ehm4Ydo*K_IIJKu64Cx6v$zWwy1w_tI#m9b^_
z7s_TP&0%jSdp|cTDQK~5T~zbsnkRn+P?c|OnIKf6n`}$vdM<y?BR-##-lZs4PbY_U
zo2U`xZgN+++nUMwx)pl3HFXwG=_hAt&0SDb<=5__fMaz5?W&yHugNAzwzRU*mbo~a
zDc+>GU6J3UNH^-sM)#UTUsdPUJ?Y89vFRj`+AcY>Y~jvBa-nD|#|Bb{YFt)(8I99D
zQ!v}IoFKYTjdin0&7EG`Tod)uivq1Gi*>FVZp6ao)@x+Wh^=VBCO&u4vtb+cjT6D_
zc%#&c>%gjSNV{pCO+lfuxLT6~Ny}`K2~zDq?ZAAqDQ=~7Z;Z>yaz-&b8MhnrGosC2
zQ#MS_zRbOdkWp{CQ()l`l;zbsBMfp}SU0>^!P(3}y+LCyk-v~|cZm5cu>%Ln`ela_
zNqKu9p;p>S@+L>h49W$T4Edy7rlpew1JvD})NySYH^VaT51OGbL!==*A|>3cf+az%
zy<XeG!YQZSZcV15Y-W5x7`Gku?Bd<j<#wQh_2|xRG@A5|q||QDjVPU*yZVU-7Rgno
z+8wGZDrbvYIbUrBoTY&l%w|UBI=bY}EFM@mZn3<9BX9jIk+&(F-KL#e|IJvy39069
z=juu&QZArcCH1B@zj=_}N<}O^JLg8AjJbAiQOKJI`FYHpxeD`%K`QO5CY>RAAb7hr
zw}wsk_L`o}oOReHXu<Z#O%hqMn$@*~(}M+bxOcEfl2m(BIoIqAf^4S0ds;0HP95F3
zLguWxL@2ewOv)Zy?Vti&j^tJ_N#=YDw3~W!%WZCLmun#B4To`MeVWxf5sl84D4^bI
zG3DI~nZv$mveC<g%oolTvE6LSJSELt?QWIjO#`;~s&-3r?|F?b88qM9c0kXiGTQSi
z56SMn6P3;E<&2tG+lIFFvY`s(GBCFfR8P^62~2h{A$d#Dx#Y7Y?c@!~pb|N;+*(~Y
zP`vMv_Sf}lcS*AQCTtGI1|(aOo)&SMcv#k`HiqQ>4l)A9Afw6{F0TcQVe)#w=qDU3
zuT>e_88!0Ca5)+!uT{xWjWN{NNq(Ckzl}Gh8Y7L}jTuIzyt=cnk1^R8ZR}<Y6dz?9
zkk=;|vy4gddX+I&jz);jOtBdzoF%bqgp=j(6l0u}Qf*8(c9pZW^4nmeuSDuCmJ=N%
z6P*}4$-mW(-*lswlvyLns*GvkJyGg0K#mn_rL4imF5*4Xsq188j2urF%WA3JP_Y>+
zzSUCFK+!+J=r0jeiK@;jN`HR(iM9UeD|*yVALpl1>Rc)Ddl@~A9!7VgTkfZ;Jom)=
z)5Y-n>1=GyPbcHM{d6?^emcmrTli6r5z)7L@Bv8$A9LtiaGg&pO5Rto#X&yM;RC)c
zt>iqCW=k$fN^BWG(g^A^&MmXXmhtfaWjyE`WAvivT^9;&I@#=$(2+|`E8i{hx0dq#
zS8}tZY_@}LMy*{pXVEPs+E5#Oe?7U`nzuM?%JiXc@$q(4?)DnB!`<W_)rLApyb=G)
zLv6FW{(nLH|A8U5$<3$@eKl*t|AnN|mHqaU<|em>9@X9=3r4jz(Cu1qSDkm*YlH7^
zH0w%eZDk{CLthEn&^O|@o)>&0Z*qJ3M%v`cXhYpRI`Vf<FCD3D&unsSH(8rDEL)_m
zt*<t1`29uv&t=UfYyH*|bSvm)`!6Pkw<`<gp7yo&)#h7?|9tak3EoN-Y--w9%GXfW
z+SgDQtsTmC;bt3_En1InuR%c<Y6GvP0)zI=;WaF1w@r?uBX~6xL~38;_iv~r6!n0v
zgzf=lP|zH{2JOXv_oBSjFVM2dMQKB=#T&7p<#p}61_ecG1K;-1MU`#9|B%6V{MexI
zLx+wMP5ssXD!RvByAGaL^;zwew|75|P#-v`bMBF%>#fhaRR8L-yl1qZwsNE8Bj{uI
z6+YZhA97go-1N*=!L;1|6N$@br(JBhD_A|N-l&jIaXP3+)f*kvziYDU!SsU)AHUyd
zt+ed%LDQBGF1N0aTPy1Klba4!iu<zuT`DRvUG$;z1ph{Ps(&`zGhC@2lppS!drZC@
z3~?TNzirS=$tRnQ^^wMvN{ie{n3QMlcXIB|?k>;4uTL~Iq~-3p9;V#k+EAYeE)!F+
ztW@U;UvnzfIj;Ucfd%QcQh{XyMbWY<`A5Vo8zNMD;6r#Q#AvyYR(}S#sOSN^LjB|h
z{hjo!dA$<qr{YLh=Z_u+dRT13>pHo2=6HLkpWHQ;WBn{N{Q$ucygnT2Z$a+Eu^z}f
zay$g~g>^6hN5XMXKT>H7$9upS><;zwWb}7=Ci1#|biy!>cY!H*1k_KfI-KLF@DO+y
zY=(<q6V%_yOmM6pgrOe}qMz~6!g>9`4E=<(19*K`I3F&AQ(zw$h6lkVFbMTi5%%X;
zZ&UQQ?DaFR>N#)28aM;)3#UW<mEKVtn{XA>UkE*p<CEZtumS4t6gP5g!K2`8cqFWa
zdY<2w<I~_^SPgf8li@VDJ8XqB;Vd{88gL$50QGlp^)sE;^7^T89XuJXh9|&d;puQC
zTmhHEG)%$})L-e{k7NBsJN+nv<9YoQxCTyu8fuk0Lx10H*{CiR74B?gsVPbRQxo~0
zLFE5CQ8T0dzfbC8@}(4KsysrbK>0R}+(va=Hq*5}B=_k$zYdfyR3ui|^^NHknXJu}
QRUcLN#n{<(b~-rz->A3Ll>h($

literal 0
HcmV?d00001

diff --git a/tests/analyse_dataset.xlsx b/tests/_comparison_results/analyse_feature.xlsx
similarity index 100%
rename from tests/analyse_dataset.xlsx
rename to tests/_comparison_results/analyse_feature.xlsx
diff --git a/tests/_comparison_results/merge_cands.xlsx b/tests/_comparison_results/merge_cands.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..e25b347bc3e77d740adb46166840951d5c5ac2d7
GIT binary patch
literal 5989
zcmZ`-1yq!4*B!cJXz3Q|80p5LQ;=422ubOXbbz5dlu%+2X{0-b?vU;h5RgVe(0{zH
zZ{2(Ozi(!}@2q#N+2?)EUi+NqJX>204V?@C0AK^s9h{96`!tkOkxwJYix_!XyI5(v
zxwyLXS-86L`ZzgijA`I?@#ABzwLkW%e;q|%K=?>1DyL<P*E6i{ffMxl=n!4T+1Iz9
zCXY9gnY}Ipe8RisBqZ2JAUnfyEj=`t8<o4|UoNm55>rJ@RKP%whRAc84Gt5)7aiWH
zuiX2@t7z*vmL~80QCx=|v0(@jue#4rvP6qZyyM3v9Zjkxmd=IoD_VDmTsP<Qpv#Rl
zMmts$rS`v47zVN6e?cyN2n7HD{xbzD7dOc79N>u>S_ppp@Lz)|{`-X`0Ps^jMomU0
z0i13g)fS$-D{ae9UssdU%FsGvt!|_P9l6SI<*#5`6Ie0U{AG%Z&8&KJm1sJ|02R9N
zK|PzUZP%Ur)Rea`h~$dB8<mPmzTk2veM(|g?_%aw^m_cfmII-5U>l+zEM@GG5}Z!U
z%WYA!Hx@2t)GzQlj5<7*c19>&WaIey`>`aS8CANncvz`JEeU5HD?{^Mo-6GSiZ=4a
zse=mCFHdq9cd1+vqXR?Nm(KoUO-;p_p=Y5YCeNa;6+yEu$@@ybz;>;|5@$r4M&q@|
z`>Rb_|0d?OX%-7HCIFCi9{?al6652@=V1qNg8aD){HEs6*u;H7lq~S5V%*2}(h09u
zIHazI)ZN}|vF0`OP?I4HXc+70Yn=!UYF82_9!euPOwwq|ltky=`!Nbu+SwKLJ^RXQ
zxf>2kIrTB0=Ubd<uIF1>GZ172%JJWOUvb=l3s-+uYitidhz1Ohk|35sqp{U|$cAQ2
z*G#DSSZ2P|EH3Lq?HzNel&sAIo^jjBj_c_dyO=T7Y%>UbCm+&_h2*1<8pyFX!fH;h
z9D8mSR-_?gd*XD0A<q0XhKxa@F7LNAjbZD0UELkeZDsTw`X(logF^=bQZSlSF7v0z
zu>}bv#Ip@Yy<T2+AHIzlT<`!nQLk;D_=wgGO-8%YqOa*R=X7PCJGK|d;nd{Yk6X`O
zE|e!07RTypO??Yk-Osek!i_c2nsV@7o{Ed?EgSbCXgkr0K$J)JmW+Gs+q2)lEq5r`
zqp*`#+}!sgA+Oz*onizSx_ZqFPokwgdvNWULOoB{yewyWZ1`0y68N*X2YtqV@?IK9
z(JeaxgAgFnJ)y4svO=ydYOO12(llXa>_M$IHTIkSkPD?yrk<sW+l%z(fdV1F>;Umm
zhm;`u5|sCc&6#7#H}1PzZ{H`+YRHeews7N|{9rM$h`8@8LFy(>Yb*b9?D8@Z&L%i^
zbcMUGDQU}~pH}&FY-}qh#*L+U`O+}pQcKC1W%ffyHS=tWA}jc9KTp#ZP_$NsK#)`h
zJ3Bn%xi&3o0ggagZus|_-csNiX+V7&cK>QD>|2S;8!3ZaLzM;ScVb2AjV$-i?h}qb
zu;7Xlmrq)j^3x5;QPQbaLB$WbduDu6J5I{7B<V;=MpE!F<-*fh*-1W#1!^dh{^Wu$
z{LGq?Jx<|cy;e<(x36Y-@4-H@D@5I?SV7|%VK*u1_Y>nP>me>}A`Km?Elsr|6V$%>
zn7)EF?oC-2kV{_L@}4lYTP;OM-HQ)Ub7PX;2_MbL8sLpl>BD4c^ZtX8M9<NgL_%QF
zGZEx}N23#f?eyWIs&5YSUKpa^<~&2Fbnb=@+1xe+hEEGQa>spB`b4R^Ma2GIW={}=
zmH);FrxmLI{B3arNV<q#5EtX5kzRV+&j(i|t+z$HJk%fM*TjNISIPI#4zVEKfHC+{
z<GEDsWW5MWfNkegw8@I1vq|@la)E_e5HE+7f!j!bpI$}sgE_Ic9S65|(m5xG*Ym8<
ziZ@0)EEEZ9A=PrM4L8w)$DG|byVfRbI750<5RoX79C;e&J>3h{)Y<KdMvewaldZjq
zTn}mPu`5ZlothCmA%%K3wJ&ukZME#8s^?Y1uA1~i_!72~NpyAI@8}->xP0oh#RBKo
zO;>23mTyCIYRX;`7tP5?Y{Z><oZy1E+UxP5-VmEdZw$yVaN3!hnE0{NsZ*JMB$-Cn
z2RqGU`kEV_F@{ix@bx##uubjN29Y^Ama=jy)|)rXo^X?`S)22`!+KSrgL?Y-iMfDD
zFB(@u-b?iu@UH7lx~bXbMAGJBA*9y1;(_-)C+5duUG^EIwk3lrkgConbYo9UPAl7<
zOVyBTj{P($>wk+6)TF>8QMO8<EL6=qyQkZ>!==*wM6?|&R7a9d)w21y?}esr1Bq81
z5vpKgv)Bheh=6V3=_(IC4z~%+h-5?K>EZNFW>&c$v)6UHcc=O3t?Pgi(YC~n+4)n2
z^1#dG&1>gm31#&x-yR@}2N;zll@^nNQgBQAYf=IxDauI7z@uZ-Q0qe4;EZd$)>Hh(
z2jrt=(qchi?pNWO&nDSO$f7FoF_<XbHjj>ypL@s5UUP>yLG=W!rzr;&B0Hso+!W^Y
z*v`rq*zv*^H}poDsoU7i)ewHp**y1uEk64sRYNJIc$WWqjlMd<J=3qD3talkaEAU$
za8=B8QG5mM*=JI2g@VA&fT*ZoV2Nd0Q-#(QTb4wP|49gA=FQAv!R7KA{>x7##&Byc
zpo|*UONJy&s~D4WkXbyLWm#cWS?_`XzHhMM8Gf#f=xPM`4Ykm8bg5V@u8Z8X?6%C}
zCjz@m0uXH?#xsG}b;9{unz)inKaU71ySn4n52M>r{=I<=2YMWNVgLXx$ZeJC&j#Y{
z;^yFP2Z4CF^ZoVn*Dey4F%-8bOcwqP<C+l4z({dqy(U0B#)k6-RSwgM5-gD5<X38(
zW*V8Ja?SYEbKZ8hxY@Vrr=AU&3B@)>E}mx*;_O5wVsfjhRQO^+OiSA3EU%5KtaY`O
zK7PMv!D;IHNuxW4pFX5kLyEWL$BWhy(<aB5x~mMfNNSg<QYwx6Bdzh%9*@5U<dA>0
z0)M&n%G5Wr;pJoCQ}j4@+tl@HzZ{7@@%X5Av|M&3b7&qs^g?1(aIq<GbTCxU*;Bwz
z!K@vwVl^pw>i(I2|CKw5jS%0EMuSg#3l&@EgY@PwBC7&X$p*Fb9Se?z)4sE!d>0pj
zw1^!(#3~3SIOwd#D;@Gvt?qn3I`Lj2+ywq(DTtF{wQ)yfz=ii2Tsm8`_%u<^qHbz{
zIcvCUF>`+Nq!hcV>4Mx(sAWuR=Y!Jsbnp37cd2{+-$}p5Q;yA~BWB8vr_<@HODw9F
z$fXo#5sE#5{sz1hKZ^U#ZRS8_`))4`MuYl5KWwfpZ!)pMsaUq0mgd!#Rx4i#Aff^?
zaK-P*(6)H>GV|S}DyzcQp@s?^;LT9}0c?X%R~A@DBY${5lw@Fhyu!5{T?x?$e{}!m
z#}VvvS_agLKgtFMd_(0F8I}r3GR#?H+o+em6!;d%5z;#B9)V$$NHlMGn7hu&Biw3Q
zF(-$gAr{wpp}nI4l1*Wx3eaqqqK&Ud&<SktumpC&st8%=A>CxTFm5pG2leqaGUh>i
z)BLqZ4}rU+xJqRmc*CV^j|K9Mvrq}LGdf_Zd7WT2xtg>!)^V~b)ELJj5TBK^)UA~F
zbh+QPOD*458tMAx8V+jHR@d+ATb@{>I`6-9L@KfqM!bz+F&5oGC)}nJWyXz=mwYV$
zC`lWSgBUYH7G0hb==^|*-U)^(3YW*rs{){s1?O9oXrT3BPVmDpQM*Vf@(nT8GYTlx
zhvB;X2GfC6zOuL@ZnH{?_yeDdbiMi?AxQ4Cj3y`Y-1C($tQNKr#0}>_eZ9t*Fb~QV
z2Rsf=e%C1h*yti5w4+_8=+{DRO3;qhi&C~iiNfH=zfP33A+@yYe~vnjuW|V*ve6@J
z#F=v6>2opj;-Tq^5}0sryHa<KFx5_dLvPEMvlyPInQYV-_BzT^t|^xSjEQY%iK8%K
zF;1v-MLZtS9hEK)Wxxn$4$%Rp%*X;CQ(|G0(ZWM|Iu;Buvl&BPEJrhc?){Os(x_1#
zg3Ilk^Rvt~OXgO0Z5&I1pi>*o{+69OQWW2lKHS-2cWNuu!+h80RQ@4OD!QCak)S$S
zREQOY3ch&*fQpr$2{r2-fd(sQ+oR&g_zxeULqG#+xu<B*$Y5CG@c;zu^((1J`oJJC
zb-V3th%|onF2CrP1IHleJQSGF_|CnIv=A}KxDksGsjWE<Cm`4GhD@hG90d;>?J6?Y
zuF~c{g&SEEZdB|e%p_Pdh#2$I825>p2@pNxeJPYmJ%}y7aU6(7GPUyS^=P_t6-vfW
zOJ_pFl+o#|4Gat-enIhB5^wQi<$JdLW|}%33S~hMD@TgOs2Fh06Ni3ACeECRjOtyS
zGAp}1LX)bQi<LF(=|@Z&0s?y$G`ynTcQ#ZIKcj9_QJxj8pzT^05?ux?i2@vnZYFA|
zD3*T?nuam)dVZZkUUfi5n3Cl*jR~io^-CXf2MlIyoK)=y`Np+=Lk>!O)<>aIe0q%5
z9QQaV8{Q|j;(tunS#R!^=cBkO&1+@Nlg`TX`35msuNGZA4}`I!(ZKgNS9qg+XgB!d
z6b~k;7+$`QU;xS#P6z0JV)f>lW32#v3AngG{M--Y^Cl?v-3w@R(wyb(DL|m@nx(>B
z_`2f5E*XZ-gZ|yBzg@=TyhjB9obUhuf<LX=-NV-r;{JQeU)EoY%j74%CGW;c(a_c@
zk}nXRR83uY4quLFdt)BTW>zFnV%^>*R7aY0K)8Ha^$vPM`?gWSTB;#`E1qtLSB&FB
z1m|Jaz6@l(a=*b6fi>k$pzKg;NtovKmY{XJ^j<=NbzMs{mtYT6Q^Y~4*nd0R+E($f
z_?%bc<4Ds(zsEdN?}A&^PqtnUa4rizc+S;jQwWvByq$FLQl!olg@qgJb(MstS6Tqm
zsh-!sp~5o`)obi8XA@l`zxM$opeeH_A(qu&U%M}T@i#<Inr-XNyYV<+YHNk`^_*Jy
zbsiJB>e4K_`fqa-2|Em!MBsB>W>aSEo#@c)#K?aDI=X=H60__KJXZTTJ?TJ}A3aOJ
zi2A18ZT$c*QVeB_KKRRSCz(Zedc6M4arWFZGn!Qo?gJI*RzB5mH->lvh%&}(?&;6W
z<!6#K-yqPK8na*2v71C`^&4xm5ni~#g6)jMuW~M0B?HqeE69oF+ZQ)&Ml@Rrg!Eg2
zMY0CyI7DL(gz7a9w9zzhF~n=CRCiFX+@4PVEY&!ceX>U+S#-)g;YT)!LuZg}M_q1K
z4>*v}M`(4XqRmYor^QZ6Wp{FBZymJ&^Et<w=_@H-%(4a{bWF*n>@jWSMQ%DWolF-3
z4tvniU1=)#`$fiE+p%HR7zx3|tO1D@38ft@0k`qKWxFk?%8CtsG{zI9<RiE(BDrPl
zDrV><&}ijG?z0w-{!{0AmxwAh2Sc*&O!!Ga=7{sKm7jH|LTLO#!a*4XABU=~6bdDU
z@~j7iQk7suOm2foRDelJxBkKe(l9yVHrG$s`Z-o_2e1dMq|s8@aMlw@rcB86a&cQC
z0}~@S#)guR3+n23C+R|yKufeRXJk#;SqJ-_Ne^irH9B6uYP}-(x1#Hn_M<wGN~S=n
z8YyaknH$8>osakTJv&Lw8C3vZwt@s=rv9NDRiJdPQUQk75V@28x;xRT`}{I?u7lAI
zsP-!-ho^!F^v;XceG%^1%inD}um7+N@X_IcdCrSGL(Ye4vOKw<gz<|xV(!mpZ2de4
z=b3tS_ZyXS+{FDR_Zo;dF4+aj_j1G=7F7k~`4_=LYQIL7#&AEq$*w3pxOxBZxN!LF
zf-xaa3GB*!o>-tB?Uhm?owM9la0a)stnpdh{dG28L?(6Bp3M)BoSX&Yu)9=VVf;JM
zOHY(yex!Ht0GTM!pVIw3Wm&qoIQ%Z&*n|m9q<aAlf)8SiJBO;M??1~3R>3A#@E<ew
zu1)Js-E^+3xUcd+eIx9C-|Wml>$5Bvkwl)trA6$SJMR;&$&s@q10KV}iXt>h8-l87
z662MfPawn2sN$8-<d?=|)mfc(a&B!sqa`7i3~5JH9)~7?DjQj&SWO<v{Az>I)1f%@
z;_te8`5(ceSF2%jHcICW3a?I$iqD1L2}Tq0<mB=`$rW6k+)Jby+j^9#(PORtfgNFv
z$8C6<C^GT6dF84Aw$j=&41Yt8GRNqzLK9i>YJdg0ox30QbprcX6;G1?g03l`3rckb
zj`N7Bve7i*c#_9Xlv}>v8IL{=7SFMwI&S)@7?(v&?V&A$l@R|bp&0jmEL&pXtVr_W
zTARpcUOT?8ksh~JjxAqK@%~M&(Eesq8Isf=NP?;Vko!kS|D^aYQPn0&5x(Nb533;A
z22d4!%0(^YEhVRfg<b(e9#xfe9ny3Hl<AuIjmFa2zggSzfuj5+bY>+;&g|F+A$<q*
zfC33xa`4PL8GGzS{sv<Jp5~rhEw(-FYA3_JkNt7#Iy{8Gm=+569!i%5#*3a1@xLr(
zC{nO~z`1R*W!mL<DcotC#E}5!*f+iYs96v^L;~c)w2!aeOAhjb%S1d~Iy7i}IHuH4
z#<X_6zi}tY%y4-q4Kl%5v_D3ViGqu>2gKRKOwZR9;{NoviprA5eygb04F1LQxsa%<
z>^^S{*Fgs!wEJoP=eL$F9xm~VA64H~5u~c+_g;Q;zhNt>w?ZElx0TZORUl&slbsGu
zHVntkVewSfB>7dC4SQVchP4sc_>!2@2UQE>g8Is@HpKWV4`htaB_F44kkoLv`n^Dv
z?UTls32L^;&5gASBGnTxN26PRXYV6ZSNWXCY8cZw*Sg+!zu=UZa7QlER^>kXylT`v
zJCvazYTG7!Pbae7cb9Q$sxdePy_msSJW3Ni<e-xTi<^}ZXY+(c<*iRnxWM^Z^q`$=
z$Gwv1e))ucQ?V_m_2K+|=*kO%Ykg6z?<@sg$LKwk!whHMZ|~olQ2P?)-X8hSRKSya
zwFqGAQWY}DGY<+X8Q_0gqDa;J^$A8Y{QnKoyXd=a<R2^m5R8KOC;C6$<X!k(&+c#d
z67u!`#lgE9;O?32KLP4O=&=6^@Lx}Gcca`LC;k&97^NRMT_Bh7_i%9+dbe-?4Sj<&
znE%nm?*i}k!M}m8kmpb2JO5d-cfoh-{BQ6Y$zS0AQT2D@+%2WQ<Mbdc)c;ag+G-d`
TixmLCL4Ku?9UMsh`|1Axu4Z=g

literal 0
HcmV?d00001

diff --git a/tests/_comparison_results/merge_similarity_candidates.pkl b/tests/_comparison_results/merge_similarity_candidates.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..02e1c74ea6aeb7e72b1e039fc4c29bc0743d6335
GIT binary patch
literal 3512
zcmd7VeTWow9LMp!J6Ao^?6|z{$>SoGJE#SX9*9g)yVTJWJQOUBVP<xAXS~_jZ5})?
zG!#1@r~2c5UReo|85Nj8VT3~HS=bX8JxTRP8c|>%B^6X)znOin+=I}6;VrwbnVa9t
ze!jn%x%<wx^!_6=QprzLSJK?9=4XteXJ&GqRxm|WyG#qTu4H*gbZ!@0MMpGAE#y5B
zwT&KbyMgJsn&W2*nycka&ll13B~H=kTU}lkLn3Nm57n`CKXY{#5zSOXw`3cAj;ZSL
zKFMG=ER+T^YLcc9I)SZuo;Dy4t9qtUbp5~!jUXPGPDYAoicFsj84^lrP%N#|#V)Z{
zbSPEIWTkeeux3W-Y%owV<DIJ4+$;9%E^n`is<w#k?(UUW<WKUUEDD6ar&ktRKRSk7
zHa*)A(G6<OD;D%Hmoq(iw2tBL%X65z6MIm{O}G>PL8q#`unkY+&-gpmR+q;$;0D}*
zPvBel9{zy0)s**Ngk87>i+BJJ;!AiAh1D2U+heBg{&7>6l2fO&4vMI1sR-o!Teo%$
zpQYSp%;NwK<1=_3n<@W3T!@G9bNmi3;ALDu`H$k`cp5*(i})K(q1+|743FS>6xLj&
zc~DrLl|s)DoO>#D59{l21TW(q1br842#~=gxEBRJi^uUKeu<ayW`efiO3dR99K`qW
z3yceFx>kXcDWCxtV~9ubG~P~u8C-|GxDTJkH}C^&rl7lVHNK7K@K0=@fcdx)U%_KI
zp};o@JO@wVNB9|jgR=?PgP-7C1X+v^VHPbsjPKx2cnPmyCxN%%UVLp-Aq}I?)T;!X
zhFkF*Ucj3O+=O%S0o;H&Jc(zqiGXWxJ(lnn`~&~Sc?5rOQ~|TDRX{yKyYWRlgs<a|
z_$&T}Qwh2qci}-y6KD>u#D07pn+Y1?3H%!4OX->gs07)KPvL3;4B;_6j;HWb{1#IL
zT8q874<q~#zrs3#K8%lG_P+{96Sxf*;Ck%G16aBAF0y_J>nY$FJc?(qi=c<FiC_y*
z#r=2`PvV#}5Va~rH}Hy%W432auYHwmSb=X?p62$Mu83xjyr7$2$s0M53*CG&q(?R=
zZnB5m-ZpXjGTCzUZ8F`!c4TkVGWPgAhE=d-6J(c4rXvr(!u3mLUtkyA<k(#!$2=KB
zA9>l8h^TR`X}fm5^3|)tT#&OJ*+q${ecZ0{$YJo9re{r~PmYhnw@wT<GVJ(4a&#GO
z;^yT8Sx*ZcyH6;sN=x!?Q5uzY`EHVH&GK`;(xA*Lzv6$|jhr=3PUWggG1;kjwrfV;
zu-C&5<JuEz-q^k_X-=-Y(EsZ^DmCL@rtH|Fsd^|cKiid!d)jwwBkq!wl<b_V;wD+C
zjz0%Q+!8mAZTz$5X4B8;nlEpOX{u}Xt1ILczFF>W?TXTs<>bL3D<39=b)!`m_vcpq
z)$Uxbv?Sg3bX^M!%gm~Fc5@s=-fEs~`D>i`ny;1Zw^}reP~IXlG~X{8YEj=Phsu$4
z@wTzQORAdW#*Z=G;m)`b-DB;OF{~%!i+ed2cPCc*m`dYT>2GGUz4BkQj`f9D`O_HR
z47FEphUd!PXvSYFpO@rAhm0RrUwN!j3ud4t)6px-)Pg`hFl`mlWHsk#`DF1H&2ftT
XzM2h7@@!~<sXCf&I=<NLhkE8;rxcrV

literal 0
HcmV?d00001

diff --git a/tests/_comparison_results/numeric_pre_filter.pkl b/tests/_comparison_results/numeric_pre_filter.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..c3013ef1a2c50f63d5c091095d0c9100f4db25eb
GIT binary patch
literal 25819
zcmeI52bdGp`iGY;O79CR1Qeu$fD{|DbcZIZs94s??(8OP%9=?itJqP*SnehEf(3hT
z*s!j>_uk9xy?5_-X5N!bHi-l3z0d!-f7pk8nauZn=XcKePMKsfoYe08S8L0hKk4p`
zW~|n<$JWFX*4Vm)8MQL$j<d|9Imh|;#>|M5Gix)$(`~Dx^@&V+pw~PcOInGT8L`Jk
z&6rtlCG1SPQ)MJx)3DGz&TP!2``}R>2?y=5S(i+@YjrBt7_MoESk*!0;~2+M(Z)4n
zs~t7bR3sTT6A5#Tgso0kHSw68Or&a(%CenfnMrpL^^VELj8V2OGqY7Nb4F%qX1LMH
zXm4ydJrnAhZdaRJ(`YHD){CcR&Rp-FZ<B6yd}h(2MF)N@f1F>&^kgP@=224Fo?aR9
zZz~b5$)tCwu1mzD!BkzHm5^w|FZk4L!?KC63I^dycsl$CjI?t9aT0tI{sjIJc5Ln1
z^?`@OQ{idwZultt0`Atv^`8#sz{M~QFNT-HtKqva6Y7_49nNdIeOokLnKYeIu`ZKt
zH8Ycx_74pne!;6$w?C|hYv2X&Liic%PxbeP`@w7B8}LK;Is84GK=qG>C%`A+3-IUg
zcd!H1t%S4St?)CL2@N&+ugio+=!!mz@Xl}wJOnnw-@`EoeE_yWz*txbH^2<M2tEWK
zhi}2(!(Ip-2oHqy@Km@CJ_X-|s)FUkD%hS1^nnM!6ucci2}e`GvG5Rh6g&%_3vYu@
z!~RrgGF%Anf$zdU!#-4CPq+eJ2e-h23O<J5VQ?}$0$vB-gzv!*;79NmaCa&Yh6$L4
z8Q2!#-QXCwCtLzQga3q6sK{0Db5F(kd3){-EeJjVpNFr(_u;MxtAa1Wy(stqI2YE!
z5WE)N2Y(EI4ZnsX5PUq`0B`hEq>r~EHz06lxE8((KZo5ATn>lA{o&!T4n7WFh2;ob
z441)1_zU<a_;)xA;j=vz=v}M=JrTMHUJ0*(H^HyqZ{gqIP6$02o&hh1?GUs(JP@ve
zm%{!CO~I}39jL}g(F#-}WIa3wE=0gaxCK50pMWpH4`3OBmcpapSuhQsgKxv`2we`3
zfVF-VXoui|Z~|NgSHX*+PQ9OV{A<{g3S0<phtI+}2)zcDBWwb!hUdfE;p4EN2d<=G
z;+IY5cr&~WegWHH*clFolVBXKg1>-+5S)Td@FVyMT!7Fs;p-mZy}Y@9C4zQ>-5{IL
zvJ)xzO88s&H5^ERhr$Fr5pIGn!uR2qa5Tck!2RGdct2DvUR3xl2%ZJ!!aLz$gpPzw
z@OXF!d>?i|SPvM6ufpL7IvfUJBm5z}2bL60yt3{HwcrN01-=dc00$uS5_k=~0nR7z
zW$<D6JnV<CQ{dU~diV$|DSReE4uU7aPvDQ>PvE~`9|R7BL70L+fP)cw5L^e>!>jap
zgkA)n@Dn}~A+w<Y7r->U3|;~CZh0mG&xYs2%iweHW7r#^<KSWNa(DuQZzv`FA_NS8
z6>x9(1NcjL5CtCzlW;S95B>~xLCBf#0r)dmfv|Jn+weV);GW*uL<^pZpb%_?r@}Mf
zU*R4I`ZfF;{174kfSo9KEIb*W3$KB<!H?mmP~~({!TTU&3490s4o*eTY-qp};4Sda
z@SkuOginVv;F0hLnh3lLKI12N2EwB7JoprBhX5O%0w0FMD7X%`r9czbz?0zV@Cx_<
zd>VcQ3tCtU8;!7Wa8FnV&xCiwT@X3~o(E?l*n+F!{cs3E4uNOEhu{}b3GLy{T`hD3
z0?vmo!%hg84_Cm|@KpE=JQzV2z)N5=3?O6%oClADn_x+ydm-#Vs0ZI;9RC@PN6-QA
zKKL8h6+u<-boeVc7$KwJB)9~=2n&R+M?eic5w3%`!lMxqg7?A^2$%;~z%}p=*c$<J
z;Zk@D{17UE-Mw9|1@=Hl0w&?juo6M@;DOMBH^Qk1+6bS4tr5@#j)V*0iLj)=u?RU5
z9u4n+kHb&lKVdz>ehL2t_eWS1z5|azNC>WlPnHsR5(N!`55Q>@v;-ao8{ld1ZTKsA
z2tsa!Tj87VbJz|+dwGO)^LFiOg!G1e;a%``_#5~&?0}F_a4Bq0!NcKN_)FLoA+|?Q
zSFfN(3b+S82b(G2Q+PH3Z-%eK-@+sXTn0D8J76<>396jx;uWIT<UJ730DlJm4j)0#
zQ*c)V%!0SUK@?=bI(VT+KxeOjc@!`PUJkFLpv?sS5gbaOW8s7F6;A;jy#?sq$20=R
z;dBbP8%`y_M);*CV25G>A0|+1l=XxQ;Ct{V@Ly2%LD7Jd2sj760sjI!qVzffJ?E!z
zIgWpVZ3!?A{v7rt;6y)#6LGi-CUJNT?vA2~@X}HW$LlJ=egxbX{t)g>;Fmqh+Iu@g
z2fCL)U2(is2g31oIL)J|XrQAAbOL;VKySiU1nTFfXg>mE;Pr4Af###+G(ScCaC`+0
zBESF?jDipQ1sYC(L*Zd?UjiHlZ-OdOi?)#-vmX-RrzmQ`@uzU6pQ2-MyZ|o8@e%k6
zysuOsU8QGGaw&m+49jrr>j}`#n?E|h!2}phiC@7c9B=Xq@H7D`aNHk<N8$T^0TvVB
zhwx&mZ~%_y`2~0g$AfYF2=0NRgZ%<LLV$HR8aUR#l~CnH(MIfx<34aJJQ~M4{Q_)4
zK^cy3!K0Ag;8%t2ILv^{;T||9`~vKT!vr`NE`WdYtHKlvFM^-Ib8+0sPr+gw{sb4{
z_$l1kFF;osv4Q~Oaj@XaegWDN;70`56^C&!>K9-Zj_<&`kiG@B_X{ux1pxxQgyYdT
zzUmjCoB+qd8{r28YEvpe0K==`D=6rU;|m_gw%&<EJI=@P3LMYFG2-WVE{^}eF^1z)
zP>s5xC0>f-u{hoY@AfNkA&#wZ*oTsi@GG$wj*D=dgu^a=CHBB!Cmc?KfAA~uK1w_X
z!^dD}zY_1qu@er*z-#>+e~e=nBtMAb8b8MlI9`q8@8Dly$$mTy$2&2+m=d4&D{)sG
zPsj0QxXRD57lte0aqxavQfd<=Zon{2N%#7d_!5qL;&>O_+t0BY!y0%pjdPrz<4{U8
zar_OARep|P9AAW2;F$4qJRXOUI2;f6@pCkByaYZA|A6$8-Ej<t$H7`0?(uWfyXRYw
zxDttPl#;6VqF++dui%58lG=EuMeT4724BLPJtY)%*bBqg;3a+zpJDg}b@{;0VLuEn
zg!e+#%ta*yFt`z}_bcI#w9aEli}^Y9#_)PLz|WyIhEF5uF+Yd)I2;NO^K*EMI?Tc0
z3s|z1R^t%Eu$x~AFW?ZN4$t~ItiYi=4zKw+aK3DR93O=x>(B{<0PN{k!e9)43h(rD
z*bT!qw9*-V4x?~*6o=<w$vRZxa4j54Yn60ZieUt<_LFoth9AS-{7QHk&8J{^hMz->
z$eVC{*3ZGlVKNR!__fl>IQ$*MA$}#Cg~NIr4)trLvvHV0<g5Kk*ocFPVbIUv797T7
zIN49qgE;(!I($;fp&N!b5c!WDDXqQJyf*w8gSR{piyHm~!&ZKV)fjg4Gwg-oM}CGo
zV^|N>fGDc*U7E-CGh9TGVZWl!B;vk)hHEjr-mmCJ3}^cpMlqb`XSfW*H~b8D#_+Fx
zhFi&^EBy>l!SHHWvS+$rFa!w&hC5;Ky2qfEH(|BGSAGU@kLaQi&i6Ce*HfFK5jbC_
zYLPWaR~YeFG7*nNtU#@m2rLNKgpzhmC}GALtXL+!Yx8A6E76!}-ddN6)jK9tatC+J
z@sRTa3(wDzd$Jo=TCrp}A~%l@$d6xD6N-lAF7a?<qZP@tES<CV8arvJJI{BWAF~^+
zhGaM%bBdnRyd@zjgUy>`f=s&KQY#z_*K1`9Qgz9?a76Ax&!ht_TqSfW*<=Q5t(pe0
zS9Pl>)ZTnS#7;WVMS0=rd24DDW-1bH$fSoBo|Ph2g=0#MUE91l+NkaePgfjl*-1zI
zQY&JH?LaD48xYIpt;r^-;gG^GOJY_kY9_<EHr&0?$Y|agibTW7q!r1wR*}9~=5&LZ
zl)L$*!n+m5IoOIeS_w0mO4O-{N^viGA=|3zR{o;S+2&|k6A8OL(m%hu7bxFgswQNs
z^2-fV?)pzi|LfRQR?K#_m1o;&Q9L<jM#>%|GEBEG2j&M|Cf_Wm5v}3oO-aj1vO)QI
z;53e#fx7A~(tK%H@=J0{GFhZZH)Kn<B9d>3uoPg$0#Z^?Dkdf+#)4B)LM5|QZ)9Qf
zR>^R(5+N&`v{%Lx^0k3bl3+vf>m-$3r7E;Cuf$xVRGN~ujbkcJ>2Ya5DJ@V~+6)=V
zVJm1iSYjn6164~U<BiVH4Mzjn{&G_$lO8n33Pt2>ZiHn`8s(c8lAoiNB;vIx=e(^#
zH3n)^iJFikbe*)5G|@uKwpYn_Mx4|sH>8qrTMbuJol2S!=|Slt5i>3uy4IZ_5|NY|
z8BRDE6sm13Pm7Cu$$~nG;EW#03i-B2JSq(&8KVZ7gj{G!KZRq`lQP-J$Q2y~tha)R
zxavbIu-pvA63v^P31f<?uqCM)n3b?XO=c*Pu<EUdL~B_ueLgDnQ|Z4)R(?cg5J~HN
zhk(o+G}*pOBF$TpO%bQ-1D^IXb#<I7%fJXr=$K^(^t2}OGwB)2%!X7-)zfJoHC=dO
zc>6%&)my58R4rXEBy_KoCJqH;a+E3kP)jVNubpq=c*dSPH^@wza>ipOJ#m2*wpD)G
zx^TC1RFkVLQr&GN%sT0acwL>IXe+Y~<7QRBOf_$kVk<2t8EGaJLmExBe9k^sbPrds
zT7j0Q>gz10vnCYOdzExnEZ}@SNGc*DLGnedGoDI}EKvy(Hn6;TYpk}xN;IjQoW@$^
zHnoh9Dre<Ev$uU?0kya^#G{P~Nr<>4M88}c=k=g`>1oI^I#1OnTU{sZW!7?<So*^m
zzY)_pbW(htnQBbhPTDx>J<v(kL`YiP3gjkgow1qp;7UvKUs5}_1R7@0iiGWW#O-J)
zW&!CMc9+ns98seBI+Gq>(9A7{_?+gg3@%Amr|z@!Qru}oGw2LeGgVhFvwN+Rdv>7S
zYK)l4CY3qaf<<z~R7-+noYMxOaIz^k0_D47G9p!?c-FY4aH1h4kz{7cq$d>!F3gJs
zvSD#jxp_-nojb?(pXJV3?!>7Y-qY5jy_IO02s(FWF@H;|y0O$Om*2-(i(xV$6rU;P
zrlQfXGu=7OKjjd$x|k`;ok~?&7tKF4Aw}b|Y9)*JLT<9O-IjBQqb=hxuS?vo+)0PH
zRghj7G1nccDw)h$Ipg+!ZVI*CIn<QdX|SZWGmxd;cEGc|DPOf(OS~zbmt~F%H^hmk
zR~kDKmpTLr7y8+H1=CaIH_U#`TV-RadeKtZHQQOX<I%Wmb=0;pm1t6Vma^@D>P<(S
zyr8zVval2^dsWU<q;gfJOsPR`v^6H;!H8@YbQRS4@eM<ALX~GFlF>gS7AaUOVu30v
zu`)h}%E~n4s&_UWfvTizawRje{SeF!a><N>@jdyFSOYtFrbMp-?^+$GOjuD{)
zk;Svf*vrlIf$RWM%_sv@T7?0mlinRMu_h~AFDF|zx$a4N<bVR1p0&GRTTyg~PNt7z
z;Rc-^ZldOAf^&Hy?U&n?jK~cQwJb#}nS9jTpfY1dA}Fn4ODb71bTjG77V|)E*7B|l
zObDJytyb-6^uVl8gGBGU)ci2VxhOH6T#%;HE3x!vwwZJDR8PGY+JSkN%;shyDuZg8
zA!|=WHg?&%OI$Z?RaNq)Y+lc%bGKZ})j9dAcJnQ#C%pxWsjZAHyT7_jde1rR4Q21=
zW+eqJm92|vzHIa4t^mq&tt}ITN_3NLsa((H&UwV=a?-mLW$WqWux=AIqTEgHDtB8m
zAy>D254Wby;wk;)EUnoKit^mrT@Y}zE}%t~v->sK1j&|GHrg^5XVQh66t^pKn-u9r
zec9+<ljy7J?7AmCnLjq21X9~2XO_+1c}Ol4Y~|QM%218TYA>U4x@QVzTb30>7pk#t
z)~mVGYnyGNiCNLEe}3cY6)|hX#xrlro;~T=o(=!TVLv-oD|O(iulyU5Y^rAqkgqJJ
zR^ULwGV5hJRQpXku+XfJSxMdYW3sZGG0VQi?X=wdXS1`E?UAz!b1xZWjGOL+mp|ZS
zG4+oBevS*Pg!h^^lkTgxW9%GqmksWIFqac{;9yy;?7DbD-VBJVb#+g9OCxCp<#I{}
zctS49Qi;6b=<Y?TxRQ*SA(`d-&CnMe(hwez5^h%B0-#n}uWf$el+$8wCKFF4JvJ|l
z+m3oJ@owR=`_2A(EN8bB^?LtNY;WgAlupiG>%;;}<Qh}$1=aPFvmx!iP%Zq<LO=^<
z(!;VHU34E73oIV9RNk(UH+q)In-b36($22-W;Ec0RI{~nEu|7E8&Iu@dJCJ|DoAf7
z!<L?lv*S%hSc^9#<gJ3-tmV#A`T4|<l=f9q%m6(Qyj`1Jv8H=_P0wA<s%sN8Z!_d3
zi7ZeJ>Po@s!MqvUJ6I%1D!i$jZFUAhCf(aTt(F3(j_zC`GgMW)F1gA~$c|g>mjYaT
zWY;T6=3EQ3m}s+$Yj!1<D<0?Vg)t?48r1s^wax}8px#(9<-G~Hy!fWcMlT97UpUvl
zc7rMNlr(pRyD^rx2-wW4+AYt%!!@F4&|GiZ0ll4;(VkmpNOt#qr%ZZ3XVgTSH#cvR
z?NcC|f!ST3dIW|{V6tDSlQ$5Z3p`uWPTqbDDv{$$t<3oY#ruwEZ(Xkz7a)6Y!&YE)
zK(Zy_X%VN12WN~5V?g$AKO<oDGs=x2@>;+cEUyQQUc&zJTDh^SF;HF^B1gmJwQ@Nc
zXbdt&$-l<Qzs4F<jA6#!#tfsoygE_X!<b-<Fs2!O#Yfo&<n?jJEMrf3z1$crM?=MD
zrq~P?&XU*zg%jlOWMhn!QejLt_L8%e@~{3zPl?n`EXO-a#yc@a$?posZ@QE?P;`|W
zQ^j|@)S{0ZD^^NL{f$ZDJItx+1Y@KePZ!GysoNm287;mQQchn{KhEea5mb4q#wtp0
zetL<u{^==d)K3rRr@Pd+yTtEmbTK*`9gPmzpZ4;|67Nqt!|$i9u{}R+jPLT(+VJ~n
zB@bQUM?L>T-=o1tA!U4^q3@k_K9?wZL&OdT`RIm^@^-Y6^DLPixg;sEV*p7bs82I@
z%o;n!!~d7@pl^ZEi=ua3$h*B{yHi3(E;g-vx5(dG%J*N%?Uu6L4!Rk&cHNu>H;-sT
zZSei|<aTS`;jk^!hrXG|+fmsYY19sPn|o9n>KyS#{4Wo+?e6;j1?~R_hTJwcqc-%_
ztPTGcl1f+h+fSO?+!}gRdyC8))!IO}Yu^2H-eIo|zQ56|E1|WOjI0fPC1^w6h~Ii%
z@Qu99?dcn7n=7LYb@S-R-#xu_q>??e&9&WTZQ8J8k-E0N+O*;KAMyJu`djCru9TMY
zU+imd&*V)dEgRfdn{Orl^Ub3rcq^5+M`>9pUqfAMUqfBAb|~3}+ih5~XgyrL26<hm
z4ZNE23|cma*D$Z$wmFiH;MJ5Dsb!JhzoC{;&;z;>x(Ad&UUT>wv=sl{i}F@KPs=tJ
zr46+fZ^XQo*R}H+<Q1t6eA`DCRk8v9Lk8dJGk}(#AGDTe>aYH-=pK75TX|m9XSG+(
z-u-kzeSV<Mxkrkww?6Aq|JHW81pT_UR<Gy$-PrDL^?LTllBbuacM7KDuAOj9J_K!N
z%gwy%iSb66d?3?GJuu#At$we|sK>+i%YO)dt+m>+#|BMXK7ri1I%ch^o-cO~tQPkb
zo!XU^rQ7L4=Q;Yd@@)M~s7t83dR%>|XZB(BaxlPo_<X0JnUs$xYpcVxtCbeH0Wcv?
zz;EN+RNYaYMqeGTsY%IAZ=FrKF}0>T9$X=&V%c4t%YV(OSlgI-s{f+Y=~96eT}07}
zZt_dStmq+B8`>p2nqh=o6{|meT$FW&?V)~tg8pXs&b;0o>SxnPSm%!(1bR^H!s|M<
zCvv<S)X&cv&9QzWnSPAlP+lJb^_Le9;8+jiJvbf!d%`LhfWzPzs2^-JlH+|~6n2FA
zsV@2}IOBO;Kfqux$CF?Z9tHI?qmJZw3Oo!R0UO{FSP%7ADdQaL$5H4<bm%8-G;&@)
zwn9Ir>>ys>3oe9<;bhnYhTtJ^84N=GY=Q$h)|(XlU3dNDscO#Ka3Guk=fdewf3bEr
z$0l3@_4hc}a(oIr8P-7k&E8s$EqDx^4UdMEP|xz6IPMQC;O=k&oC^1bO>ich1?NEn
zE`W=m{>rR=qSEQSeg<3*PlYGJ6XEf216&PP!Idxt6R-~IZ{W`7Sbr~0KM3FiUOx@4
zgX5rvTH7}0ug9$z)UK?|orNql70G`oBL6Rk{BI*_M%4fFNOe@cL*h)4N6FMDUw)DM
rpw?#6?W^nLwp!<(edYTP@l|$pZK_cwX)|e6N7Riic4obu3Xc6Z3Ol{~

literal 0
HcmV?d00001

diff --git a/tests/_comparison_results/tk_graph_built.pkl b/tests/_comparison_results/tk_graph_built.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..15b3a5d8a8ffda90b814dee8819c0109b1a5edd1
GIT binary patch
literal 1766
zcmb7_TZ<e;6vrKBXLovLcQ%AD#3cj~2Pa_07X@FE5Lh)b*(iw!USd<zUE8NJJ=H^9
zvb(v6qL2hi5z-1?5EQ|0;FE9jG5jKV^qlJMnaPemWTAIYojP^Szv}#|_KU{%|Exdf
zf7}ZxlSw#XLJpW@>5&$CkSI0<ox9ct*_g|BXp}$B&wZKS&-<?7XVG+xP-Zdr&S02H
zqp~!mq1%pmqBsu^xJbb8I3GT*-PTnRfzc5tCdU*8?O_ewvtLGBP1Wr0eJhi~y{293
z#*|4}Wh_G5yUIr6+guuvF4o_OU?RAZVmjpuck5lLr+jQgCW}y1(0P0Imx{R>&Hj-1
zXPzRs&V>}oygaIEUA6mWU!-O!Y#S?CbBB#$9*viSo3kHMZJ3M~8Lu40irG|*E0=X^
zA&YO}unz(K+Qf5OFAgZjbAVHLAbm6}%|M4jPLwV<Ss5=@#!IDPeGtvl`Yx?HUB#T8
z$B=6WzcZyrdyD2NA?C8!OKqblhaOr}g)J+EdNl#OqDhi3)`3?wDLURcIPW*%mQa9m
znlyPyU3gv3MeM+v8nC`wZWFwrNzY2&fhLJ7?$@++C&N&3bC9X=VHmLpco?E^w~=N^
zB9Nws#3-Ba9BM8IW1)CNI6@s)PPkz)Gi>3;ExSP4iN{*!!>wY(<AmcVaN83hLmHJ}
zW8cun4v+iVbS*IdBvVKE-urde_KPaVlbYK!nPKTX<VwT3c4*~XpzBuK8bm46nwNJH
z@7&Y35gK^B8&$8`c+Y&p=w|tbwTElkcg=f<6K@bK-k|RIRfLH55GURsSiC`;c!OZ^
z2GK``g_q0Ho@dYRF^~f|@JgP6<__HTfI$cD755^5Ll4sb!rg_h7I4umDhS_!2Ocx%
z!q=WHOnKkagvRG6|2o3bpl;z~cV^|t6`y9x+!p)*r<>O>>_fDxm(Z=3oIs*Su2E0|
z-@4{oNXT+vA0vx56Jpn-S0EOV$-&x@tJ9m{c2-J9nc`*Pt;_K(7I7IJEhojC%pe+=
z2CJ_Z4{na-Tf6@4U5@R;j}XHJuy7H-$6(z?MesE{LKUYLmthG)YVAkEE%<mCz$e2R
ze2U*q#2G%@ds2gNkElJNxQ<xL<1(SnAHg^9o&Q&Xm71t+t$0{|EquR0Pmh3vw%@a6
p<)~0;9e#qI9l_7=)RNzE$>y&3@7>jLcRk$QC9K(lKmD!-{{nA3qwxR$

literal 0
HcmV?d00001

diff --git a/tests/analysis/test_graphs.py b/tests/analysis/test_graphs.py
index b962f30..2d59df3 100644
--- a/tests/analysis/test_graphs.py
+++ b/tests/analysis/test_graphs.py
@@ -2,6 +2,7 @@ import networkx as nx
 import pytest
 
 from lang_main.analysis import graphs
+from lang_main.errors import EmptyEdgesError, EmptyGraphError, EdgePropertyNotContainedError
 
 TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH'
 
@@ -40,13 +41,18 @@ def build_init_graph(token_graph: bool):
 
 
 @pytest.fixture(scope='module')
-def graph():
+def graph() -> graphs.DiGraph:
     return build_init_graph(token_graph=False)
 
 
 @pytest.fixture(scope='module')
-def tk_graph():
-    return build_init_graph(token_graph=True)
+def tk_graph() -> graphs.TokenGraph:
+    return build_init_graph(token_graph=True)  # type: ignore
+
+
+@pytest.fixture(scope='module')
+def tk_graph_undirected(tk_graph) -> graphs.Graph:
+    return tk_graph.undirected
 
 
 def test_graph_size(graph):
@@ -61,7 +67,45 @@ def test_save_to_GraphML(graph, tmp_path):
     assert saved_file.exists()
 
 
-def test_metadata_retrieval(graph):
+def test_save_load_pickle_tk_graph(tk_graph, tmp_path):
+    filename = 'test_save_tkg'
+    tk_graph.to_pickle(tmp_path, filename)
+    load_pth = (tmp_path / filename).with_suffix('.pkl')
+    assert load_pth.exists()
+    loaded_graph = graphs.TokenGraph.from_file(load_pth)
+    assert loaded_graph.nodes == tk_graph.nodes
+    assert loaded_graph.edges == tk_graph.edges
+    filename = None
+    tk_graph.to_pickle(tmp_path, filename)
+    load_pth = (tmp_path / tk_graph.name).with_suffix('.pkl')
+    assert load_pth.exists()
+    loaded_graph = graphs.TokenGraph.from_file(load_pth)
+    assert loaded_graph.nodes == tk_graph.nodes
+    assert loaded_graph.edges == tk_graph.edges
+
+
+@pytest.mark.parametrize(
+    'import_graph,directed', [('tk_graph', True), ('tk_graph_undirected', False)]
+)
+def test_save_load_GraphML_tk_graph(import_graph, tk_graph, directed, tmp_path, request):
+    test_graph = request.getfixturevalue(import_graph)
+    filename = 'test_save_tkg'
+    tk_graph.to_GraphML(tmp_path, filename, directed=directed)
+    load_pth = (tmp_path / filename).with_suffix('.graphml')
+    assert load_pth.exists()
+    loaded_graph = graphs.TokenGraph.from_file(load_pth, node_type_graphml=int)
+    assert loaded_graph.nodes == test_graph.nodes
+    assert loaded_graph.edges == test_graph.edges
+    filename = None
+    tk_graph.to_GraphML(tmp_path, filename, directed=directed)
+    load_pth = (tmp_path / tk_graph.name).with_suffix('.graphml')
+    assert load_pth.exists()
+    loaded_graph = graphs.TokenGraph.from_file(load_pth, node_type_graphml=int)
+    assert loaded_graph.nodes == test_graph.nodes
+    assert loaded_graph.edges == test_graph.edges
+
+
+def test_get_graph_metadata(graph):
     metadata = graphs.get_graph_metadata(graph)
     assert metadata['num_nodes'] == 4
     assert metadata['num_edges'] == 6
@@ -72,7 +116,7 @@ def test_metadata_retrieval(graph):
     assert metadata['total_memory'] == 448
 
 
-def test_graph_update_batch():
+def test_update_graph_batch():
     graph_obj = build_init_graph(token_graph=False)
     graphs.update_graph(graph_obj, batch=((4, 5), (5, 6)), weight_connection=8)
     metadata = graphs.get_graph_metadata(graph_obj)
@@ -82,7 +126,7 @@ def test_graph_update_batch():
     assert metadata['max_edge_weight'] == 8
 
 
-def test_graph_update_single_new():
+def test_update_graph_single_new():
     graph_obj = build_init_graph(token_graph=False)
     graphs.update_graph(graph_obj, parent=4, child=5, weight_connection=7)
     metadata = graphs.get_graph_metadata(graph_obj)
@@ -92,7 +136,7 @@ def test_graph_update_single_new():
     assert metadata['max_edge_weight'] == 7
 
 
-def test_graph_update_single_existing():
+def test_update_graph_single_existing():
     graph_obj = build_init_graph(token_graph=False)
     graphs.update_graph(graph_obj, parent=1, child=4, weight_connection=5)
     metadata = graphs.get_graph_metadata(graph_obj)
@@ -103,13 +147,13 @@ def test_graph_update_single_existing():
 
 
 @pytest.mark.parametrize('cast_int', [True, False])
-def test_graph_undirected_conversion(graph, cast_int):
+def test_convert_graph_to_undirected(graph, cast_int):
     graph_undir = graphs.convert_graph_to_undirected(graph, cast_int=cast_int)
     # edges: (1, 2, w=1) und (2, 1, w=6) --> undirected: (1, 2, w=7)
     assert graph_undir[1][2]['weight'] == pytest.approx(7.0)
 
 
-def test_graph_cytoscape_conversion(graph):
+def test_convert_graph_to_cytoscape(graph):
     cyto_graph, weight_data = graphs.convert_graph_to_cytoscape(graph)
     node = cyto_graph[0]
     edge = cyto_graph[-1]
@@ -144,7 +188,17 @@ def test_tk_graph_properties(tk_graph):
     assert metadata_undirected['total_memory'] == 392
 
 
-def test_graph_degree_filter(tk_graph):
+def test_filter_graph_by_edge_weight(tk_graph):
+    """Edges between nodes 1 and 2 are gone after filtering with weight bounds 2 and 5."""
+    weight_bounds = {'bound_lower': 2, 'bound_upper': 5}
+    filtered_graph = graphs.filter_graph_by_edge_weight(tk_graph, **weight_bounds)
+    # neither direction between nodes 1 and 2 may be left in the filtered graph
+    removed_edges = ((1, 2), (2, 1))
+    for parent, child in removed_edges:
+        assert not filtered_graph.has_edge(parent, child)
+
+
+def test_filter_graph_by_node_degree(tk_graph):
     filtered_graph = graphs.filter_graph_by_node_degree(
         tk_graph,
         bound_lower=3,
@@ -153,7 +207,7 @@ def test_graph_degree_filter(tk_graph):
     assert len(filtered_graph.nodes) == 2
 
 
-def test_graph_edge_number_filter(tk_graph):
+def test_filter_graph_by_number_edges(tk_graph):
     number_edges_limit = 1
     filtered_graph = graphs.filter_graph_by_number_edges(
         tk_graph,
@@ -166,3 +220,75 @@ def test_graph_edge_number_filter(tk_graph):
         bound_upper=None,
     )
     assert len(filtered_graph.nodes) == 2, 'one edge should result in only two nodes'
+
+
+def test_add_weighted_degree():
+    """The weighted degree is stored on the nodes under the requested property name."""
+    attr = 'degree_weighted'
+    plain_graph = build_init_graph(token_graph=False)
+    graphs.add_weighted_degree(plain_graph, 'weight', attr)
+    for node, expected in ((1, 14), (2, 10), (3, 6)):
+        assert plain_graph.nodes[node][attr] == expected
+
+
+def test_static_graph_analysis():
+    """Check the weighted degree on the token graph and on its undirected variant."""
+    attr = 'degree_weighted'
+    expected_degrees = {1: 14, 2: 10, 3: 6}
+    token_graph = build_init_graph(token_graph=True)
+    (token_graph,) = graphs.static_graph_analysis(token_graph)  # type: ignore
+    for node, expected in expected_degrees.items():
+        assert token_graph.nodes[node][attr] == expected
+    for node, expected in expected_degrees.items():
+        assert token_graph.undirected.nodes[node][attr] == expected
+
+
+def test_pipe_add_graph_metrics():
+    """Both graphs returned by the pipeline step carry the weighted degree."""
+    attr = 'degree_weighted'
+    expected_degrees = {1: 14, 2: 10, 3: 6}
+    directed = build_init_graph(token_graph=False)
+    undirected = graphs.convert_graph_to_undirected(directed, cast_int=True)
+    graph_collection = graphs.pipe_add_graph_metrics(directed, undirected)
+    # the first returned graph is checked completely before the second one
+    for idx in (0, 1):
+        for node, expected in expected_degrees.items():
+            assert graph_collection[idx].nodes[node][attr] == expected
+
+
+def test_pipe_rescale_graph_edge_weights(tk_graph):
+    directed, undirected = graphs.pipe_rescale_graph_edge_weights(tk_graph)
+    checks = ((directed, 2, 1, 1.0), (directed, 1, 2, 0.0952))  # (graph, u, v, weight)
+    checks += ((undirected, 2, 1, 1.0), (undirected, 1, 2, 1.0))
+    for rescaled, parent, child, expected in checks:
+        assert rescaled[parent][child]['weight'] == pytest.approx(expected)
+
+
+@pytest.mark.parametrize('import_graph', ['graph', 'tk_graph'])
+def test_rescale_edge_weights(import_graph, request):
+    """Rescaling yields the same two edge weights for both parametrized fixtures."""
+    rescaled = graphs.rescale_edge_weights(request.getfixturevalue(import_graph))
+    for (parent, child), expected in (((2, 1), 1.0), ((1, 2), 0.0952)):
+        assert rescaled[parent][child]['weight'] == pytest.approx(expected)
+
+
+@pytest.mark.parametrize('import_graph', ['graph', 'tk_graph'])
+def test_verify_property(import_graph, request):
+    """An unknown edge property raises, the 'weight' property passes verification."""
+    graph_under_test = request.getfixturevalue(import_graph)
+    with pytest.raises(EdgePropertyNotContainedError):
+        graphs.verify_property(graph_under_test, property='centrality')
+    # a successful verification is expected to return a falsy value
+    assert not graphs.verify_property(graph_under_test, property='weight')
+
+
+def test_verify_non_empty_graph():
+    test_graph = nx.Graph()  # stage 1: neither nodes nor edges
+    with pytest.raises(EmptyGraphError):
+        graphs.verify_non_empty_graph(test_graph)
+    test_graph.add_nodes_from(range(1, 5))  # stage 2: nodes 1 to 4, still no edges
+    with pytest.raises(EmptyEdgesError):
+        graphs.verify_non_empty_graph(test_graph, including_edges=True)
+    assert not graphs.verify_non_empty_graph(test_graph, including_edges=False)
+    test_graph.add_edges_from(((1, 2), (1, 3), (2, 4)))  # stage 3: nodes and edges
+    assert not graphs.verify_non_empty_graph(test_graph, including_edges=True)
diff --git a/tests/analysis/test_preprocessing.py b/tests/analysis/test_preprocessing.py
index eb6caf9..bc87f15 100644
--- a/tests/analysis/test_preprocessing.py
+++ b/tests/analysis/test_preprocessing.py
@@ -2,8 +2,11 @@
 executed in in a pipeline
 """
 
+from pathlib import Path
+from lang_main import model_loader
 from lang_main.analysis import preprocessing as ppc
 from lang_main.analysis import shared
+from lang_main.types import LanguageModels, STFRModelTypes
 
 
 def test_load_data(raw_data_path, raw_data_date_cols):
@@ -71,3 +74,43 @@ def test_analyse_feature(raw_data_path, raw_data_date_cols):
 
     (data,) = ppc.analyse_feature(data, target_feature=target_features[0])
     assert len(data) == 139
+
+
+def test_numeric_pre_filter_feature(data_analyse_feature, data_numeric_pre_filter_feature):
+    """Filter by a lower bound on "len" and compare the result with the benchmark.
+
+    The dataset has 139 entries; the minimum "len" value of 15 occurs only once, so
+    retaining all values >= 16 removes exactly one entry and leaves 138 entries.
+    """
+    compare_cols = ['entry', 'len', 'num_occur', 'num_assoc_obj_ids']
+    (filtered_data,) = ppc.numeric_pre_filter_feature(
+        data=data_analyse_feature,
+        feature='len',
+        bound_lower=16,
+        bound_upper=None,
+    )
+    assert len(filtered_data) == 138
+    is_equal = filtered_data[compare_cols] == data_numeric_pre_filter_feature[compare_cols]
+    assert bool(is_equal.all(axis=None))
+
+
+def test_merge_similarity_duplicates(data_analyse_feature, data_merge_similarity_duplicates):
+    """Merge similar entries of a 10-entry subset and compare with the benchmark.
+
+    Constructed use case: with a threshold of 0.8, 2 out of 10 entries are merged into one.
+    """
+    compare_cols = ['entry', 'len', 'num_occur', 'num_assoc_obj_ids']
+    # reduce dataset to 10 entries
+    subset = data_analyse_feature.iloc[:10]
+    stfr_model = model_loader.load_sentence_transformer(
+        model_name=STFRModelTypes.ALL_MPNET_BASE_V2,
+    )
+    (merged_data,) = ppc.merge_similarity_duplicates(
+        data=subset,
+        model=stfr_model,
+        cos_sim_threshold=0.8,
+    )
+    assert len(merged_data) == 9
+    merged_view = merged_data[compare_cols]
+    benchmark_view = data_merge_similarity_duplicates[compare_cols]
+    assert bool((merged_view == benchmark_view).all(axis=None))
diff --git a/tests/analysis/test_tokens.py b/tests/analysis/test_tokens.py
new file mode 100644
index 0000000..dc16ef2
--- /dev/null
+++ b/tests/analysis/test_tokens.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+
+import pytest
+
+from lang_main import model_loader
+from lang_main.analysis import graphs, tokens
+from lang_main.types import SpacyModelTypes
+
+SENTENCE = (  # German sample text (two sentences) parsed by the spaCy-based tests
+    'Ich ging am 22.05. mit ID 0912393 schnell über die Wiese zu einem Menschen, '
+    'um ihm zu helfen. Ich konnte nicht mit ansehen, wie er Probleme beim Tragen '
+    'seiner Tasche hatte.'
+)
+
+
+@pytest.fixture(scope='module')
+def spacy_model():
+    """Load the ``DE_CORE_NEWS_SM`` spaCy model once for all tests of this module."""
+    return model_loader.load_spacy(
+        model_name=SpacyModelTypes.DE_CORE_NEWS_SM,
+    )
+
+
+def test_pre_clean_word():
+    """Digits embedded in a word are removed while the umlauts are kept."""
+    assert tokens.pre_clean_word('Öl3bad2024prüfung') == 'Ölbadprüfung'
+
+
+def test_is_str_date():
+    """Date-like strings are accepted, plain digit runs and mixed words are rejected."""
+    complete_dates = ('22.05.2024', '22-05-2024')
+    non_dates = ('9009090909', 'hello347')
+    # the date without a year is checked with fuzzy matching switched on
+    assert tokens.is_str_date('22.05.', fuzzy=True)
+    # all remaining checks rely on the default arguments
+    for candidate in complete_dates:
+        assert tokens.is_str_date(candidate)
+    for candidate in non_dates:
+        assert not tokens.is_str_date(candidate)
+
+
+# TODO: depends on fixed Constants
+def test_obtain_relevant_descendants(spacy_model):
+    """Check the relevant descendants of token 1 in each of the two sentences."""
+    doc = spacy_model(SENTENCE)
+    sentences = tuple(doc.sents)
+    # (sentence index, expected descendant texts); token 1 is "ging" (POS:VERB) in
+    # the first sentence and "konnte" (POS:AUX) in the second sentence
+    expectations = (
+        (0, ('0912393', 'schnell', 'Wiese', 'Menschen')),
+        (1, ('mit', 'Probleme', 'Tragen', 'Tasche')),
+    )
+    for sent_idx, expected_texts in expectations:
+        word = sentences[sent_idx][1]
+        rel_descs = tokens.obtain_relevant_descendants(word)
+        found_texts = tuple(token.text for token in rel_descs)
+        assert expected_texts == found_texts
+
+
+def test_add_doc_info_to_graph(spacy_model):
+    """The sample text adds 11 nodes and 17 edges to a freshly created token graph."""
+    parsed_doc = spacy_model(SENTENCE)
+    token_graph = graphs.TokenGraph()
+    tokens.add_doc_info_to_graph(token_graph, parsed_doc, weight=2)
+    assert (len(token_graph.nodes), len(token_graph.edges)) == (11, 17)
+    assert '0912393' in token_graph.nodes
+
+
+def test_build_token_graph(
+    data_merge_similarity_duplicates,
+    spacy_model,
+    data_tk_graph_built,
+):
+    built_graph, _ = tokens.build_token_graph(
+        data=data_merge_similarity_duplicates,
+        model=spacy_model,
+    )
+    for kind in ('nodes', 'edges'):  # only the sizes are compared with the reference
+        assert len(getattr(built_graph, kind)) == len(getattr(data_tk_graph_built, kind))
diff --git a/tests/conftest.py b/tests/conftest.py
index 244efcf..c2f44e6 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,5 +1,7 @@
 from pathlib import Path
+from lang_main.analysis import graphs
 
+import pandas as pd
 import pytest
 
 DATE_COLS: tuple[str, ...] = (
@@ -12,7 +14,7 @@ DATE_COLS: tuple[str, ...] = (
 
 @pytest.fixture(scope='session')
 def raw_data_path():
-    pth_data = Path('./tests/Dummy_Dataset_N_1000.csv')
+    pth_data = Path('./tests/_comparison_results/Dummy_Dataset_N_1000.csv')
     assert pth_data.exists()
 
     return pth_data
@@ -21,3 +23,27 @@ def raw_data_path():
 @pytest.fixture(scope='session')
 def raw_data_date_cols():
     return DATE_COLS
+
+
+@pytest.fixture(scope='session')
+def data_analyse_feature() -> pd.DataFrame:
+    pth_dir = Path(__file__).parent / '_comparison_results'  # independent of the cwd
+    return pd.read_pickle(pth_dir / 'analyse_feature.pkl')
+
+
+@pytest.fixture(scope='session')
+def data_numeric_pre_filter_feature() -> pd.DataFrame:
+    pth_dir = Path(__file__).parent / '_comparison_results'  # independent of the cwd
+    return pd.read_pickle(pth_dir / 'numeric_pre_filter.pkl')
+
+
+@pytest.fixture(scope='session')
+def data_merge_similarity_duplicates() -> pd.DataFrame:
+    pth_dir = Path(__file__).parent / '_comparison_results'  # independent of the cwd
+    return pd.read_pickle(pth_dir / 'merge_similarity_candidates.pkl')
+
+
+@pytest.fixture(scope='session')
+def data_tk_graph_built():
+    pth_dir = Path(__file__).parent / '_comparison_results'  # independent of the cwd
+    return graphs.TokenGraph.from_file(pth_dir / 'tk_graph_built.pkl')