From 5a789b760568b5afefbbb789c3c8215147f2f1dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20F=C3=B6rster?= Date: Thu, 14 Nov 2024 16:40:00 +0100 Subject: [PATCH] added new test cases --- notebooks/misc.ipynb | 1917 ++++++++++++++++- notebooks/test.graphml | 37 + pyproject.toml | 2 + src/lang_main/analysis/graphs.py | 57 +- src/lang_main/analysis/preprocessing.py | 26 +- src/lang_main/analysis/tokens.py | 77 +- src/lang_main/constants.py | 10 +- src/lang_main/io.py | 5 +- src/lang_main/pipelines/predefined.py | 4 +- .../Dummy_Dataset_N_1000.csv | 0 tests/_comparison_results/analyse_feature.pkl | Bin 0 -> 25949 bytes .../analyse_feature.xlsx} | Bin tests/_comparison_results/merge_cands.xlsx | Bin 0 -> 5989 bytes .../merge_similarity_candidates.pkl | Bin 0 -> 3512 bytes .../numeric_pre_filter.pkl | Bin 0 -> 25819 bytes tests/_comparison_results/tk_graph_built.pkl | Bin 0 -> 1766 bytes tests/analysis/test_graphs.py | 148 +- tests/analysis/test_preprocessing.py | 43 + tests/analysis/test_tokens.py | 79 + tests/conftest.py | 28 +- 20 files changed, 2339 insertions(+), 94 deletions(-) create mode 100644 notebooks/test.graphml rename tests/{ => _comparison_results}/Dummy_Dataset_N_1000.csv (100%) create mode 100644 tests/_comparison_results/analyse_feature.pkl rename tests/{analyse_dataset.xlsx => _comparison_results/analyse_feature.xlsx} (100%) create mode 100644 tests/_comparison_results/merge_cands.xlsx create mode 100644 tests/_comparison_results/merge_similarity_candidates.pkl create mode 100644 tests/_comparison_results/numeric_pre_filter.pkl create mode 100644 tests/_comparison_results/tk_graph_built.pkl create mode 100644 tests/analysis/test_tokens.py diff --git a/notebooks/misc.ipynb b/notebooks/misc.ipynb index 5b47137..b354b50 100644 --- a/notebooks/misc.ipynb +++ b/notebooks/misc.ipynb @@ -21,17 +21,26 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loaded TOML config file successfully.\n" + ] + } + ], "source": [ - "import networkx as nx" + "import networkx as nx\n", + "from lang_main.analysis import graphs" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af", "metadata": {}, "outputs": [], @@ -56,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 18, "id": "c4fd9997-1e41-49f1-b879-4b3a6571931d", "metadata": {}, "outputs": [], @@ -70,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1", "metadata": {}, "outputs": [ @@ -85,7 +94,7 @@ " (2, 1, {'weight': 6})]" ] }, - "execution_count": 4, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -96,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 68, "id": "d017b2bc-9cd3-4124-afed-c6eabc07a540", "metadata": {}, "outputs": [], @@ -105,9 +114,582 @@ "G.add_edges_from(edges_to_add)" ] }, + { + "cell_type": "code", + "execution_count": 69, + "id": "f8bbf276-3b07-41d6-ad74-778f09cbab96", + "metadata": {}, + "outputs": [], + "source": [ + "graphs.add_weighted_degree(G, 'weight', 'degree_weighted')" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "d7b6f917-23f6-44a4-bc8d-125f7658e4d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OutEdgeView([(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)])" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G.edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "473e9e25-d417-4a0a-bff2-7765de516a89", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "0a48d11d-1f2b-475e-9ddf-bb9a3f67accb", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "e340377a-0df4-44ca-b18e-8b354e273eb9", + "metadata": {}, + "outputs": [], + "source": [ + "save_pth = Path.cwd() / 'test.graphml'" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "66677ad0-a1e5-4772-a0ba-7fbeeda55297", + "metadata": {}, + "outputs": [], + "source": [ + "nx.write_graphml(G, save_pth)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "f01ebe25-56b9-410a-a2bf-d5a6e211de7a", + "metadata": {}, + "outputs": [], + "source": [ + "G_load = nx.read_graphml(save_pth, node_type=int)" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "10bfad35-1f96-41a1-9014-578313502e6c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OutEdgeView([(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)])" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G_load.edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66189241-637e-4765-b6f0-6ff090b6ba0a", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1af4ba3-ced8-425f-a730-da14fd8aab8e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1efd5f4e-fd19-46fd-bb7e-b23bec724cdd", + "metadata": {}, + "outputs": [], + "source": [ + "from lang_main.pipelines.predefined import STFR_MODEL" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "50ee13e1-e10e-4efe-8706-6ca321f6cf9a", + "metadata": {}, + "outputs": [], + "source": [ + "sents = [\n", + " 'Kontrolle der Schmiernippel',\n", + " 'Kontrolle der Schmiersysteme',\n", + "]\n", + "'Kontrolle der Lichtschranken\n", + "Überprüfung der Spannrollen\n", + "Überprüfung der Druckventile\n", + "Kontrolle der Schmiernippel\n", + "Kontrolle der Schmiersysteme\n", + "Inspektion der Förderbänder\n", + "Reinigung der Luftfilter\n", + "Inspektion der Schutzabdeckungen\n", + "Überprüfung der Ölstände\n", + "'Überprüfung der Hydraulik'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ca0b4089-d8cc-4566-a9ef-ed35b55d18b0", + "metadata": {}, + "outputs": [], + "source": [ + "embds = STFR_MODEL.encode(sents)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cff09ea6-04b9-4544-aee5-0a7e0bbda2d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[1.0000, 0.8907],\n", + " [0.8907, 1.0000]])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "STFR_MODEL.similarity(embds, embds)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "45dc7050-9b6e-4c62-ba87-a74fb7985933", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "384" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "STFR_MODEL.max_seq_length" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54bf2e2a-7ada-4e4d-9e2e-1d17631e7d06", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 8, + "id": "c5d970e6-7bfd-4da0-82da-56a12e12a86c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7dcf9e86-a7d3-436c-a705-cddb83e704bd", + "metadata": {}, + "outputs": [], + "source": [ + "data = {\n", + " 'idx': [0,1,2,3,4],\n", + " 'data': ['test1', 'test2', 'test3', 'test4', 'test5']\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0962d3af-e44d-4078-ac4f-dbd59e6a33eb", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pd.DataFrame.from_dict(data)\n", + "df2 = pd.DataFrame.from_dict(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5743636e-0330-4c7b-879b-0aa8ff6bfa53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bool((df1 == df2).all(axis=None))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d88b4e70-012e-4dfe-ad52-4210386ed8fd", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "4afe4713-20d5-4626-a942-e28c4eff8d0a", + "metadata": {}, + "outputs": [], + "source": [ + "p = Path(r'A:\\Arbeitsaufgaben\\lang-main\\tests\\_comparison_results')" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "da810b1b-b5cf-4c18-ad26-eff156ccfd54", + "metadata": {}, + "outputs": [], + "source": [ + "p_load = p / 'merge_similarity_candidates.pkl'" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "aa5774ff-5be3-4a7a-92dc-09331f12ee2d", + "metadata": {}, + "outputs": [], + "source": [ + "df1 = pd.read_pickle(p_load)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "e4ec576e-ec39-4981-99e7-75fdc7ac0979", + "metadata": {}, + "outputs": [], + "source": [ + "df2 = pd.read_pickle(p_load)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "dc24c3a0-484b-4019-8f2c-4913e36d9b1b", + "metadata": {}, + "outputs": [], + "source": [ + "df1_c = df1[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']]\n", + "df2_c = df2[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "83ade5ae-95f7-4f44-afb2-e1c5a2c5694c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entrylennum_occurnum_assoc_obj_ids
41TrueTrueTrueTrue
22TrueTrueTrueTrue
13TrueTrueTrueTrue
6TrueTrueTrueTrue
29TrueTrueTrueTrue
10TrueTrueTrueTrue
17TrueTrueTrueTrue
61TrueTrueTrueTrue
5TrueTrueTrueTrue
\n", + "
" + ], + "text/plain": [ + " entry len num_occur num_assoc_obj_ids\n", + "41 True True True True\n", + "22 True True True True\n", + "13 True True True True\n", + "6 True True True True\n", + "29 True True True True\n", + "10 True True True True\n", + "17 True True True True\n", + "61 True True True True\n", + "5 True True True True" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(df1_c == df2_c)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97d6dd4a-7f3d-4459-bf42-46d0bd087ccd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "35463772-bf3c-43b4-b536-cf4456b3f0f2", + "metadata": {}, + "outputs": [], + "source": [ + "from dateutil import parser" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "fa9c87f8-a42c-447d-bbb3-9c9d6830bd04", + "metadata": {}, + "outputs": [], + "source": [ + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "8d6f97a6-dafa-439e-9d9e-c37515be81bf", + "metadata": {}, + "outputs": [], + "source": [ + "pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "5a1a15e7-f9bb-463f-9c83-a12ff0f8328e", + "metadata": {}, + "outputs": [], + "source": [ + "dates = ['22.05.', '08.2024', '22.05.2024', 'hallo', '22.1250.25']" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "442beb19-06ca-46ce-9d64-2e6c632ffb3c", + "metadata": {}, + "outputs": [], + "source": [ + "string = '22.1250.25'" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "21e4a7c2-76f4-43bd-aeed-34e52ed53db3", + "metadata": {}, + "outputs": [], + "source": [ + "match = pattern_dates.search(string)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "3f5d75f6-58dd-43a6-abf3-80f581807554", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('22.', None, '1250')" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "match.groups()" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "306bcd91-8b87-47fe-96d4-cbc2a2bbad88", + "metadata": {}, + "outputs": [], + "source": [ + "dates_recog = []\n", + "for date in dates:\n", + " match = pattern_dates.search(date)\n", + " date_found = any(match.groups())\n", + " dates_recog.append(date_found)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "4e996e9b-8d75-4060-984e-ee439bfd5d45", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[True, True, True, False, True]" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dates_recog" + ] + }, + { + "cell_type": "code", + "execution_count": 55, "id": "91d4094b-f886-4056-a697-5223f157f1d3", "metadata": {}, "outputs": [], @@ -118,17 +700,1326 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "518cada9-561a-4b96-b750-3d500d1d28b9", + "execution_count": null, + "id": "0dabae5f-89b6-4457-a4ef-17cc33c6d561", "metadata": {}, "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8830bbd6-ce01-475b-b492-455400319a9d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loaded TOML config file successfully.\n" + ] + } + ], "source": [ - "from lang_main.analysis import graphs" + "from lang_main import model_loader\n", + "from lang_main.analysis import tokens, graphs\n", + "\n", + "from lang_main.types import SpacyModelTypes" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, + "id": "ee31987c-9763-4952-8d83-bf9265430e74", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\thinc\\shims\\pytorch.py:261: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", + " model.load_state_dict(torch.load(filelike, map_location=device))\n" + ] + } + ], + "source": [ + "sentence = (\n", + " 'Ich ging am 22.05. mit ID 0912393 schnell über die Wiese zu einem Menschen, um ihm zu helfen. '\n", + " 'Ich konnte nicht mit ansehen, wie er Probleme beim Tragen '\n", + " 'seiner Tasche hatte.'\n", + ")\n", + "model = model_loader.load_spacy(\n", + " model_name=SpacyModelTypes.DE_CORE_NEWS_SM,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e086ee66-95c3-4fbc-bd04-a16b0fcdb26a", + "metadata": {}, + "outputs": [], + "source": [ + "doc = model(sentence)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "120c886d-6f2d-48e1-a300-8f39d9771204", + "metadata": {}, + "outputs": [], + "source": [ + "from spacy import displacy" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "30b5f152-be1f-43c6-8466-98de50a28443", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + " Ich\n", + " PRON\n", + "\n", + "\n", + "\n", + " ging\n", + " VERB\n", + "\n", + "\n", + "\n", + " am\n", + " ADP\n", + "\n", + "\n", + "\n", + " 22.05.\n", + " NUM\n", + "\n", + "\n", + "\n", + " mit\n", + " ADP\n", + "\n", + "\n", + "\n", + " ID\n", + " X\n", + "\n", + "\n", + "\n", + " 0912393\n", + " NUM\n", + "\n", + "\n", + "\n", + " schnell\n", + " ADV\n", + "\n", + "\n", + "\n", + " über\n", + " ADP\n", + "\n", + "\n", + "\n", + " die\n", + " DET\n", + "\n", + "\n", + "\n", + " Wiese\n", + " NOUN\n", + "\n", + "\n", + "\n", + " zu\n", + " ADP\n", + "\n", + "\n", + "\n", + " einem\n", + " DET\n", + "\n", + "\n", + "\n", + " Menschen,\n", + " NOUN\n", + "\n", + "\n", + "\n", + " um\n", + " SCONJ\n", + "\n", + "\n", + "\n", + " ihm\n", + " PRON\n", + "\n", + "\n", + "\n", + " zu\n", + " PART\n", + "\n", + "\n", + "\n", + " helfen.\n", + " VERB\n", + "\n", + "\n", + "\n", + " Ich\n", + " PRON\n", + "\n", + "\n", + "\n", + " konnte\n", + " AUX\n", + "\n", + "\n", + "\n", + " nicht\n", + " PART\n", + "\n", + "\n", + "\n", + " mit\n", + " ADV\n", + "\n", + "\n", + "\n", + " ansehen,\n", + " VERB\n", + "\n", + "\n", + "\n", + " wie\n", + " SCONJ\n", + "\n", + "\n", + "\n", + " er\n", + " PRON\n", + "\n", + "\n", + "\n", + " Probleme\n", + " NOUN\n", + "\n", + "\n", + "\n", + " beim\n", + " ADP\n", + "\n", + "\n", + "\n", + " Tragen\n", + " NOUN\n", + "\n", + "\n", + "\n", + " seiner\n", + " DET\n", + "\n", + "\n", + "\n", + " Tasche\n", + " NOUN\n", + "\n", + "\n", + "\n", + " hatte.\n", + " VERB\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " sb\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " cp\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " da\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " pm\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " sb\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " ng\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " oc\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mo\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " sb\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " oa\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " mnr\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " nk\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " ag\n", + " \n", + " \n", + "\n", + "\n", + "\n", + " \n", + " \n", + " oc\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "displacy.render(doc, style=\"dep\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "944b2da6-2c2a-4a58-b0ad-b2f280b7fecb", + "metadata": {}, + "outputs": [], + "source": [ + "sent = list(doc.sents)[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "ad8c1f0a-c46f-4b47-99d3-fa7e254ff570", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'konnte'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word = sent[1]\n", + "word.text" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "189207d4-d0e1-4b8a-be8d-f5328f37c9da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Ich,\n", + " konnte,\n", + " nicht,\n", + " mit,\n", + " ansehen,\n", + " ,,\n", + " wie,\n", + " er,\n", + " Probleme,\n", + " beim,\n", + " Tragen,\n", + " seiner,\n", + " Tasche,\n", + " hatte,\n", + " .]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(word.subtree)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a83bdfab-4ada-482a-b0be-d093f115a6e5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ich:\t\tPRON\n", + "konnte:\t\tAUX\n", + "nicht:\t\tPART\n", + "mit:\t\tADV\n", + "ansehen:\t\tVERB\n", + ",:\t\tPUNCT\n", + "wie:\t\tSCONJ\n", + "er:\t\tPRON\n", + "Probleme:\t\tNOUN\n", + "beim:\t\tADP\n", + "Tragen:\t\tNOUN\n", + "seiner:\t\tDET\n", + "Tasche:\t\tNOUN\n", + "hatte:\t\tVERB\n", + ".:\t\tPUNCT\n" + ] + } + ], + "source": [ + "for token in word.subtree:\n", + " print(f'{token}:\\t\\t{token.pos_}')" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "04194be3-7f30-4f02-a3ed-c2ca016652b6", + "metadata": {}, + "outputs": [], + "source": [ + "from lang_main.analysis import tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ea169167-f55e-4574-92bc-54aafc75ccc7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ging'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "word.text" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "da9c5a7b-162d-4b99-b59e-f97bb765d08c", + "metadata": {}, + "outputs": [], + "source": [ + "rel_descs = tokens.obtain_relevant_descendants(word)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2fefb0dc-8285-4f42-9323-23b0bc9d8cc0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0912393, schnell, Wiese, Menschen)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tuple(rel_descs)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "397e088f-743b-4554-a695-65d0ddaac8ce", + "metadata": {}, + "outputs": [], + "source": [ + "tk_graph = graphs.TokenGraph()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fd46701c-e428-43f8-80d9-979e96094bf3", + "metadata": {}, + "outputs": [], + "source": [ + "tokens.add_doc_info_to_graph(tk_graph, doc, weight=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "fc860f52-4bdb-469f-be8b-901bea39224e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NodeView(('gehen', '0912393', 'schnell', 'Wiese', 'Mensch', 'mit', 'Problem', 'Tragen', 'Tasche', 'ansehen', 'haben'))" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tk_graph.nodes" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "56e49d63-7374-428f-a1b0-26e3d136ab9a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "OutEdgeView([('gehen', '0912393'), ('gehen', 'schnell'), ('gehen', 'Wiese'), ('gehen', 'Mensch'), ('mit', 'Problem'), ('mit', 'Tragen'), ('mit', 'Tasche'), ('Problem', 'Tragen'), ('Problem', 'Tasche'), ('Tragen', 'Tasche'), ('ansehen', 'mit'), ('ansehen', 'Problem'), ('ansehen', 'Tragen'), ('ansehen', 'Tasche'), ('haben', 'Problem'), ('haben', 'Tragen'), ('haben', 'Tasche')])" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tk_graph.edges" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "f0056e82-4ddc-4034-afc9-c25e3c2331b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['gehen',\n", + " '0912393',\n", + " 'schnell',\n", + " 'Wiese',\n", + " 'Mensch',\n", + " 'mit',\n", + " 'Problem',\n", + " 'Tragen',\n", + " 'Tasche',\n", + " 'ansehen',\n", + " 'haben']" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(tk_graph.nodes)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "ee506f29-a6d0-47b9-a980-0227fa1d2a59", + "metadata": {}, + "outputs": [], + "source": [ + "tkg, undir = graphs.pipe_rescale_graph_edge_weights(tk)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "29a82ea9-6a66-47d3-bdbf-e41284785bc9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
1234
10.00.09520.74870.9830
21.00.00000.00000.8959
30.00.00000.00000.9538
40.00.00000.00000.0000
\n", + "
" + ], + "text/plain": [ + " 1 2 3 4\n", + "1 0.0 0.0952 0.7487 0.9830\n", + "2 1.0 0.0000 0.0000 0.8959\n", + "3 0.0 0.0000 0.0000 0.9538\n", + "4 0.0 0.0000 0.0000 0.0000" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nx.to_pandas_adjacency(tkg)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "de96d4db-0c98-4957-a91d-6d12e51fe2ee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'weight': np.float32(1.0)}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "undir[2][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "c802f550-5200-41f5-882e-a8eb780bacf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
1234
10.00001.00000.09520.9412
21.00000.00000.00000.6864
30.09520.00000.00000.8661
40.94120.68640.86610.0000
\n", + "
" + ], + "text/plain": [ + " 1 2 3 4\n", + "1 0.0000 1.0000 0.0952 0.9412\n", + "2 1.0000 0.0000 0.0000 0.6864\n", + "3 0.0952 0.0000 0.0000 0.8661\n", + "4 0.9412 0.6864 0.8661 0.0000" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nx.to_pandas_adjacency(undir)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84c78e9a-8b34-465c-9bc4-13b38fa0cc32", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "58fe4954-ce69-4442-b6fa-504f1466b1dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tk.has_edge(1,2)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "baad206f-94ab-495a-8cc2-87a873220401", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
1234
10.07.02.05.0
27.00.00.03.0
32.00.00.04.0
45.03.04.00.0
\n", + "
" + ], + "text/plain": [ + " 1 2 3 4\n", + "1 0.0 7.0 2.0 5.0\n", + "2 7.0 0.0 0.0 3.0\n", + "3 2.0 0.0 0.0 4.0\n", + "4 5.0 3.0 4.0 0.0" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nx.to_pandas_adjacency(tk.undirected)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "996a303e-02db-496a-bd23-29c92d13d260", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n" + ] + } + ], + "source": [ + "print(tk.undirected.has_edge(1,2))\n", + "print(tk.undirected.has_edge(2,1))" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "5dbe02a1-3883-44d7-836b-dd2c4d27f5f8", + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Graph.to_undirected() got an unexpected keyword argument 'inplace'", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[52], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m filt \u001b[38;5;241m=\u001b[39m \u001b[43mgraphs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfilter_graph_by_edge_weight\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtk\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mundirected\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m6\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[1;32mA:\\Arbeitsaufgaben\\lang-main\\src\\lang_main\\analysis\\graphs.py:230\u001b[0m, in \u001b[0;36mfilter_graph_by_edge_weight\u001b[1;34m(graph, bound_lower, bound_upper)\u001b[0m\n\u001b[0;32m 228\u001b[0m filtered_graph\u001b[38;5;241m.\u001b[39mremove_edge(edge[\u001b[38;5;241m0\u001b[39m], edge[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m 229\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m bound_upper \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m weight \u001b[38;5;241m>\u001b[39m bound_upper:\n\u001b[1;32m--> 230\u001b[0m filtered_graph\u001b[38;5;241m.\u001b[39mremove_edge(edge[\u001b[38;5;241m0\u001b[39m], edge[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m 232\u001b[0m filtered_graph\u001b[38;5;241m.\u001b[39mto_undirected(inplace\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, logging\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 233\u001b[0m filtered_graph\u001b[38;5;241m.\u001b[39mupdate_metadata(logging\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", + "\u001b[1;31mTypeError\u001b[0m: Graph.to_undirected() got an unexpected keyword argument 'inplace'" + ] + } + ], + "source": [ + "filt = graphs.filter_graph_by_edge_weight(tk.undirected, 2, 6)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "3d9ecb28-23ef-48ac-9cee-86ace6be7af1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
1234
10.00.02.05.0
20.00.00.03.0
32.00.00.04.0
45.03.04.00.0
\n", + "
" + ], + "text/plain": [ + " 1 2 3 4\n", + "1 0.0 0.0 2.0 5.0\n", + "2 0.0 0.0 0.0 3.0\n", + "3 2.0 0.0 0.0 4.0\n", + "4 5.0 3.0 4.0 0.0" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nx.to_pandas_adjacency(filt.undirected)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "42345f27-585f-4498-a4cc-50d17c9f9b69", + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "too many values to unpack (expected 2)", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[54], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[43mfilt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43medges\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mweight\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n", + "File \u001b[1;32mA:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\networkx\\classes\\reportviews.py:1095\u001b[0m, in \u001b[0;36mOutEdgeView.__getitem__\u001b[1;34m(self, e)\u001b[0m\n\u001b[0;32m 1090\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e, \u001b[38;5;28mslice\u001b[39m):\n\u001b[0;32m 1091\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m nx\u001b[38;5;241m.\u001b[39mNetworkXError(\n\u001b[0;32m 1092\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(\u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m does not support slicing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1093\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtry list(G.edges)[\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mstart\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mstop\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m:\u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;241m.\u001b[39mstep\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 1094\u001b[0m )\n\u001b[1;32m-> 1095\u001b[0m u, v \u001b[38;5;241m=\u001b[39m e\n\u001b[0;32m 1096\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 1097\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_adjdict[u][v]\n", + "\u001b[1;31mValueError\u001b[0m: too many values to unpack (expected 2)" + ] + } + ], + "source": [ + "filt.edges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17e7c931-d94e-43cf-ac97-bb6fccc1ee70", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "7dfa028e-d2e7-4390-bd36-b08b0a591b22", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filt.has_edge(1,2)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "1b6e6938-1546-490a-9b64-e3d2f60d188d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "filt.has_edge(2,1)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "518cada9-561a-4b96-b750-3d500d1d28b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(tk.edges)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "9830c614-5c16-41fd-8987-be3d421da34a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'degree_weighted': 14}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tk.nodes[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "42b2bb65-534f-4c9c-b439-d5eec4b285e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'degree_weighted': 10}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tk.undirected.nodes[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c937e70b-bd89-4c3b-aa09-5f0a63982c13", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 13, "id": "3235f188-6e99-4855-aa3d-b0e04e3db319", "metadata": {}, "outputs": [ @@ -144,7 +2035,7 @@ " 'total_memory': 448}" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/test.graphml b/notebooks/test.graphml new file mode 100644 index 0000000..58011f8 --- /dev/null +++ b/notebooks/test.graphml @@ -0,0 +1,37 @@ + + + + + + + 14 + + + 10 + + + 6 + + + 12 + + + 1 + + + 2 + + + 5 + + + 3 + + + 6 + + + 4 + + + diff --git a/pyproject.toml b/pyproject.toml index 9df6b04..f40a111 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -118,6 +118,8 @@ exclude_also = [ "def __repr__", "def __str__", "@overload", + "if logging", + "if TYPE_CHECKING", ] [tool.coverage.html] diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py index ec0af65..3ebce80 100644 --- a/src/lang_main/analysis/graphs.py +++ b/src/lang_main/analysis/graphs.py @@ -198,8 +198,10 @@ def filter_graph_by_edge_weight( graph: TokenGraph, bound_lower: int | None, bound_upper: int | None, + property: str = 'weight', ) -> TokenGraph: """filters all edges which are within the provided bounds + inclusive limits: bound_lower <= edge_weight <= bound_upper are retained Parameters ---------- @@ -216,12 +218,12 @@ def filter_graph_by_edge_weight( original_graph_edges = copy.deepcopy(graph.edges) filtered_graph = graph.copy() - if not any([bound_lower, bound_upper]): + if not any((bound_lower, bound_upper)): logger.warning('No bounds provided, returning original graph.') return filtered_graph for edge in original_graph_edges: - weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight']) + weight = typing.cast(int, filtered_graph[edge[0]][edge[1]][property]) if bound_lower is not None and weight < bound_lower: filtered_graph.remove_edge(edge[0], edge[1]) if bound_upper is not None and weight > bound_upper: @@ -329,14 +331,12 @@ def static_graph_analysis( Parameters ---------- tk_graph_directed : TokenGraph - token graph (directed) and with rescaled edge weights - tk_graph_undirected : Graph - token graph (undirected) and with rescaled edge weights + token graph (directed) Returns ------- - tuple[TokenGraph, Graph] - token graph (directed) and undirected version with added weighted degree + tuple[TokenGraph] + token graph (directed) with included undirected version and calculated KPIs """ graph = graph.copy() graph.perform_static_analysis() @@ -559,12 +559,12 @@ class TokenGraph(DiGraph): return hash(self.__key()) """ - def copy(self) -> TokenGraph: + def copy(self) -> Self: """returns a (deep) copy of the graph Returns ------- - TokenGraph + Self deep copy of the graph """ return copy.deepcopy(self) @@ -669,7 +669,7 @@ class TokenGraph(DiGraph): return token_graph, undirected - def perform_static_analysis(self): + def perform_static_analysis(self) -> None: """calculate different metrics directly on the data of the underlying graphs (directed and undirected) @@ -717,16 +717,11 @@ class TokenGraph(DiGraph): saving_path = self._save_prepare(path=path, filename=filename) if directed: - target_graph = self._directed - elif not directed and self._undirected is not None: - target_graph = self._undirected + target_graph = self.directed else: - raise ValueError('No undirected graph available.') + target_graph = self.undirected save_to_GraphML(graph=target_graph, saving_path=saving_path) - # saving_path = saving_path.with_suffix('.graphml') - # nx.write_graphml(G=target_graph, path=saving_path) - # logger.info('Successfully saved graph as GraphML file under %s.', saving_path) def to_pickle( self, @@ -743,13 +738,14 @@ class TokenGraph(DiGraph): filename to be given, by default None """ saving_path = self._save_prepare(path=path, filename=filename) - saving_path = saving_path.with_suffix('.pickle') + saving_path = saving_path.with_suffix('.pkl') save_pickle(obj=self, path=saving_path) @classmethod def from_file( cls, path: Path, + node_type_graphml: type = str, ) -> Self: # !! no validity checks for pickle files # !! GraphML files not correct because not all properties @@ -757,7 +753,7 @@ class TokenGraph(DiGraph): # TODO REWORK match path.suffix: case '.graphml': - graph = typing.cast(Self, nx.read_graphml(path, node_type=int)) + graph = typing.cast(Self, nx.read_graphml(path, node_type=node_type_graphml)) logger.info('Successfully loaded graph from GraphML file %s.', path) case '.pkl' | '.pickle': graph = typing.cast(Self, load_pickle(path)) @@ -767,17 +763,18 @@ class TokenGraph(DiGraph): return graph - @classmethod - def from_pickle( - cls, - path: str | Path, - ) -> Self: - if isinstance(path, str): - path = Path(path) + # TODO check removal + # @classmethod + # def from_pickle( + # cls, + # path: str | Path, + # ) -> Self: + # if isinstance(path, str): + # path = Path(path) - if path.suffix not in ('.pkl', '.pickle'): - raise ValueError('File format not supported.') + # if path.suffix not in ('.pkl', '.pickle'): + # raise ValueError('File format not supported.') - graph = typing.cast(Self, load_pickle(path)) + # graph = typing.cast(Self, load_pickle(path)) - return graph + # return graph diff --git a/src/lang_main/analysis/preprocessing.py b/src/lang_main/analysis/preprocessing.py index 69dac81..dcebabd 100644 --- a/src/lang_main/analysis/preprocessing.py +++ b/src/lang_main/analysis/preprocessing.py @@ -205,6 +205,30 @@ def numeric_pre_filter_feature( bound_lower: int | None, bound_upper: int | None, ) -> tuple[DataFrame]: + """filter DataFrame for a given numerical feature regarding their bounds + bounds are inclusive: entries (bound_lower <= entry <= bound_upper) are retained + + Parameters + ---------- + data : DataFrame + DataFrame to filter + feature : str + feature name to filter + bound_lower : int | None + lower bound of values to retain + bound_upper : int | None + upper bound of values to retain + + Returns + ------- + tuple[DataFrame] + filtered DataFrame + + Raises + ------ + ValueError + if no bounds are provided, at least one bound must be set + """ if not any([bound_lower, bound_upper]): raise ValueError('No bounds for filtering provided') @@ -228,7 +252,7 @@ def numeric_pre_filter_feature( # a more robust identification of duplicates negating negative side effects # of several disturbances like typos, escape characters, etc. # build mapping of embeddings for given model -def merge_similarity_dupl( +def merge_similarity_duplicates( data: DataFrame, model: SentenceTransformer, cos_sim_threshold: float, diff --git a/src/lang_main/analysis/tokens.py b/src/lang_main/analysis/tokens.py index 6b35b45..f9009e7 100644 --- a/src/lang_main/analysis/tokens.py +++ b/src/lang_main/analysis/tokens.py @@ -11,6 +11,7 @@ from lang_main.analysis.graphs import ( TokenGraph, update_graph, ) +from lang_main.analysis.shared import pattern_dates from lang_main.constants import ( POS_INDIRECT, POS_OF_INTEREST, @@ -38,21 +39,40 @@ def is_str_date( string: str, fuzzy: bool = False, ) -> bool: + """not stable function to test strings for dates, not 100 percent reliable + + Parameters + ---------- + string : str + string to check for dates + fuzzy : bool, optional + whether to use dateutils.parser.pase fuzzy capability, by default False + + Returns + ------- + bool + indicates whether date was found or not + """ try: # check if string is a number # if length is greater than 8, it is not a date int(string) - if len(string) > 8: + if len(string) not in {2, 4}: return False except ValueError: # not a number pass try: - parse(string, fuzzy=fuzzy) + parse(string, fuzzy=fuzzy, dayfirst=True, yearfirst=False) return True except ValueError: - return False + date_found: bool = False + match = pattern_dates.search(string) + if match is None: + return date_found + date_found = any(match.groups()) + return date_found def obtain_relevant_descendants( @@ -106,7 +126,7 @@ def add_doc_info_to_graph( if not (token.pos_ in POS_OF_INTEREST or token.tag_ in TAG_OF_INTEREST): continue # skip token which are dates or times - if is_str_date(string=token.text): + if token.pos_ == 'NUM' and is_str_date(string=token.text): continue relevant_descendants = obtain_relevant_descendants(token=token) @@ -252,32 +272,33 @@ def build_token_graph_simple( return graph, docs_mapping -def build_token_graph_old( - data: DataFrame, - model: SpacyModel, -) -> tuple[TokenGraph]: - # empty NetworkX directed graph - # graph = nx.DiGraph() - graph = TokenGraph() +# TODO check removal +# def build_token_graph_old( +# data: DataFrame, +# model: SpacyModel, +# ) -> tuple[TokenGraph]: +# # empty NetworkX directed graph +# # graph = nx.DiGraph() +# graph = TokenGraph() - for row in tqdm(data.itertuples(), total=len(data)): - # obtain properties from tuple - # attribute names must match with preprocessed data - entry_text = cast(str, row.entry) - weight = cast(int, row.num_occur) +# for row in tqdm(data.itertuples(), total=len(data)): +# # obtain properties from tuple +# # attribute names must match with preprocessed data +# entry_text = cast(str, row.entry) +# weight = cast(int, row.num_occur) - # get spacy model output - doc = model(entry_text) +# # get spacy model output +# doc = model(entry_text) - add_doc_info_to_graph( - graph=graph, - doc=doc, - weight=weight, - ) +# add_doc_info_to_graph( +# graph=graph, +# doc=doc, +# weight=weight, +# ) - # metadata - graph.update_metadata() - # convert to undirected - graph.to_undirected() +# # metadata +# graph.update_metadata() +# # convert to undirected +# graph.to_undirected() - return (graph,) +# return (graph,) diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py index 4f27ccf..7b7f50c 100644 --- a/src/lang_main/constants.py +++ b/src/lang_main/constants.py @@ -43,6 +43,9 @@ LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file'] LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr'] LOGGING_DEFAULT_GRAPHS: Final[bool] = False +# ** pickling +PICKLE_PROTOCOL_VERSION: Final[int] = 5 + # ** paths input_path_conf = Path.cwd() / Path(CONFIG['paths']['inputs']) INPUT_PATH_FOLDER: Final[Path] = input_path_conf.resolve() @@ -91,12 +94,7 @@ else: STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args # ** language dependency analysis # ** POS -# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX']) -# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX']) -# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN']) -# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX']) -POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV']) -# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB']) +POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV', 'NUM']) POS_INDIRECT: frozenset[str] = frozenset(['AUX']) # ** TAG # TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD']) diff --git a/src/lang_main/io.py b/src/lang_main/io.py index f5b3af4..9b985eb 100644 --- a/src/lang_main/io.py +++ b/src/lang_main/io.py @@ -4,6 +4,7 @@ import shutil from pathlib import Path from typing import Any +from lang_main.constants import PICKLE_PROTOCOL_VERSION from lang_main.loggers import logger_shared_helpers as logger @@ -39,7 +40,7 @@ def save_pickle( path: str | Path, ) -> None: with open(path, 'wb') as file: - pickle.dump(obj, file, protocol=pickle.HIGHEST_PROTOCOL) + pickle.dump(obj, file, protocol=PICKLE_PROTOCOL_VERSION) logger.info('Saved file successfully under %s', path) @@ -56,7 +57,7 @@ def encode_to_base64_str( obj: Any, encoding: str = 'utf-8', ) -> str: - serialised = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL) + serialised = pickle.dumps(obj, protocol=PICKLE_PROTOCOL_VERSION) b64_bytes = base64.b64encode(serialised) return b64_bytes.decode(encoding=encoding) diff --git a/src/lang_main/pipelines/predefined.py b/src/lang_main/pipelines/predefined.py index 0e073a0..22bb04d 100644 --- a/src/lang_main/pipelines/predefined.py +++ b/src/lang_main/pipelines/predefined.py @@ -5,7 +5,7 @@ from lang_main.analysis import graphs from lang_main.analysis.preprocessing import ( analyse_feature, load_raw_data, - merge_similarity_dupl, + merge_similarity_duplicates, numeric_pre_filter_feature, remove_duplicates, remove_NA, @@ -100,7 +100,7 @@ def build_merge_duplicates_pipe() -> Pipeline: }, ) pipe_merge.add( - merge_similarity_dupl, + merge_similarity_duplicates, { 'model': STFR_MODEL, 'cos_sim_threshold': THRESHOLD_SIMILARITY, diff --git a/tests/Dummy_Dataset_N_1000.csv b/tests/_comparison_results/Dummy_Dataset_N_1000.csv similarity index 100% rename from tests/Dummy_Dataset_N_1000.csv rename to tests/_comparison_results/Dummy_Dataset_N_1000.csv diff --git a/tests/_comparison_results/analyse_feature.pkl b/tests/_comparison_results/analyse_feature.pkl new file mode 100644 index 0000000000000000000000000000000000000000..359919b0e234c34651e35212a15bda81f09ee170 GIT binary patch literal 25949 zcmeI52bdGp`iGY;DrM;l>k?29kgft&WGSLUlT}nK>tuJb37fKJ63W`JfT*$DYwQIJ z_TC%TwfEk8xxM%9{m#sLlF24F}e zV`kiJvXXW-vu$lO(a=2KJFm|JC#f~q?BbB$1KxraqKR#DeKi$Vy7IQRjc^wqeE2unq>{a<~@$14cWzzpRE&!k@rD!tNbi zy8-Y}crrW%-UT0pU%>4JElYa1}fsUI0IXgQ@=Ra4&cbd;@+6KZn1Eld1mE@Obznd;$I( z{tkAdy0vf?yaj#+v*8iO;5FIsXkF1~5#9q%g$Kbl__pl#AYvBH{37!nsz^CAwP*t$1R0X?IfdOzIn1;8(C*fEsI36AZkAP>uv*E4q zX*ie)O@Z^_-SA!bXE=ZgOoGecweVh8RKdp(JQ7ZUhrw&%oA5pO0sIL50`5QsA}|Rv zFbg{)yf+*NC&5MVGx$$9m5N*mKlfB@khkY<*Mi^^@Ok(ed>?LyusZl6+?9g&f&0Qn z7>3ut`{0k^ui@8lG=h(Xr^6dO6&c{I$n^-^8XgDVg`dOT2(E%7;NI|17=n+(S78+b z7s4g51^xp53H}|9MEGn^1^Sn&KwpF|fLFk);f?Ss_*?ilxD`TAgr~yGU>5}K0QZM0 z;3aS{Lep>~dLg;2AIjpM!71N`x+j zhrvd_3Uonm4V(;@z!mT!s8jFf9RC{jr2-eg+u*Zs4nnVnRS265>*0CuHuyL!>VYdL znD`abIo<$og-3r`oIW$6^=sCp)d$r;1A*5u&i+6Ra7FBZGXk-9N!DyhJS!V5PC7Z8eR|g zBk-m0VfZ{8gs_v~neaOJ2rMgnCPEH?tKlc`NAM@`UvK~dYhVzj;Sb<2gdPCbz;*CS zeIB6~!YBNMk3q<6Xux?e122V_L%my`iNG`AdGJ#B9Q+veN9Y822)qm)kKpUe3BM2l zLtr)B9sU6R5*|Rohr<-y0N;Z@gFO*)8hil$3|1rTEciBj&m*|6cQ(<2ry(c|Tj0s? zRQOl8BZ7Vn{{}xq$Uoq=6g(cD2+xLB!&~9U@KdOAx}@Me5V8oq1AhmnA!s%<;PLQg z_-FV}xDCRm!x`{!_ybJ@-U*-a6FdW9F?bGq3U)z&4Nrm(!;usmf}JVQgbi>tTnjIU z55TA4SFosswXm@Wn*b-l5Ihau1-C)yXm}2siC_z^g!jYY2ssFz0Uv^2Kqa(~H+Qwr z(FiyXz6`fTzsfxQq^2iL-1!C?s53GMeyb0DKXfE6zTJQ!q4MFF^XJAJJ^n_#Je0TyZD{wqQ z4u?m=+u`HzQ}|EVgs@-2f5E*G7K87=qYx5?$H6Dd30zG`K9-;BoMmuopsX zkDy*&K`j(;H+&AZQNXA0Oak5nUx&YiDGIn0Zh*JLHuw@$In~oEM6by^BA^-m4E`NH zf}p41b_kdSZ-qlC$bcbufk!|OuYkD}Fb-Y@uce?31pW~mL7=1IgYXqk0o}a?=-tO| z1Wv%|6mS=uMu2nSm!5##N(FqFKpj!m7tVw4!Joi?LDdH(1MWh=Iq(hm7uX%8*AnPC zKZQ$i{1fa_?4f0d87Xh;HIyjO*`=R6%KShIZ zd<70Azz`Jd1RwSbG>QNR!$aVn1ULrX2vwq%Y$H8pKP13UQPhm%PvJ~IMMvRyK3t09 zBk&b?U%5cKO3$F=5(51gR^T|$6QGMXe{_HY2{4utzk;ne-sl(LX#!N^xHk@u!uS0G zEF{1W;YCzo9~{r|3-Az*2jch<+z~|w`UQA|0BdkGaBP6fp~{PrjW`g;J>W)oB#w9Z z1z3-Q3LM{pMmE46lT*pr8kiFL)d~dnXd@xF3#}<9H5^Q9sACar_64 zaU7q5YSfi1@e&-5#_>jYmtTqVaqNJ@9+Y&LUy1#2T!7;)IBerrVjmo~!eKT1gI|gF zQQ}z`J_dXEm3Tjn+v0E(yvEP*$2j&x@`E_8@^kEl<5f8R4*nIE?Z;DayaU6FDDio} z61T%~Esi(A6@HHWFkB9gf%n6*Qd=qUbPO|;bdO(&FX1=|$2;Nfevb7RHoy~UoMZeP zM^K`P<8N@R^K*>g_#(U<$E=^@u{eyu;aIqbpQDN6#qe492c(znj-xO<1~%ewx1Xcl zJ>QJPToQ`8jmO;b3@(pTk?! zVGa&oz_P8h5{Ed3z5Pmf0f#7cc-GHh84i^=yyoY?`HH=9d=!?g!?qX%U|+uy`e1NA zyu#067?FPp@9=Zj9>Y~M>#2SYJK^vs4$s4~m9E9%8aRUHE$grt!zf(oC+ScOKZe`; zmGCl}Pr~q2KZiJx*W>uCpM#CV6dVroYo!x$_&bKf{Yp3khjlm{?AJ43N%{Dz`RI9IAu44lV-fxif1$1wOtytk}b)$ zjiGeB$uX&uTfXCthn%k|K0ixt;%;7U#Z!@}+-*LjFn(P_I2MuH$|Ef;Ry5nbbk5qV z?3AT$Ro`yExZPqkry_~CQ}mp+dy}Fv*tQ`q$YusDwj%LJlU6n_9ZH2FQMqkBn+ddY zmC)%_s~K#x8k)sk)vdZ%d)xU@JLN1d=mn;B7jR*F~=i7PdBW7~#U zi@K3KQ+=RiryTK%t*9BX1L=5UKrGufrdp+j!;8Z#id*TJnTq7waED?eqith28jGY- zRy5aICHi8C(+y@yZvU4GZ(kheKr7Z_CCyYi8B!6I;(qi(u2t2&|0SJs&C$9l8gYAM zaA9}PQ@+7;L)cd3R~e>!O&}@#uVdF)aog2am20O3iPX3mX?vW=Fx|eaDGa(qzKYNw zS|e@iQN@l6vnBwNGli_3~!&W3^FHa=pdj{bY!G;&sNh-TSRcLuZiTOsUH6?9Z#?_kA zkhGID(R|CcSIC!4oYbi@q>>3+4Odg0N|{mVLFpn9 zGa(nc(VZZY(X<*FPBB5~%9MVvB^J`x&R2IlW6zx%WTs6! z<1w4rd7c%qResvKaJO?*ldCLJ-EAbzkaR>M6w(uIZLVS5tO}Uvw)IkMt>q*m&7@*T zqp6n9+vkh!;|f+Q(9(2M$Z|Ssa#6ikNN2?Z&i9F=A~F&rU(`C|sl=Evl^|gQOWQWa z8=I|UtIEmUSgYKomJw3ttUPG;wr@P37MJEktR*Q4k&uKKlyBpL9+YoM4PQd%sruxq z>!iKRT22#7e>meeYC4BbiqAFEEh*bc8z;SMoMcUgrNym4exlYHo6QWXwIu%~wew4$ zVFs;e#7;!rj+SEPk**PU3C+n7C91EpnTbWs+-``^Y1_!)l5}yqG5&7AKY4?hS?9Iezdgcg}JrPSx<9wjS%PMEgY0xigFTTUynPrDnOp zKF(Q;lnJ5qOtEh|7K=F3ozwhN4^pd(nYP@iRHb#v{8JNBEFr5_s&p^pCrjIHId?eP zG9C-M#Qlz*bckC8>4nkzx4=jT)V5X@mZD{^&Y6l-uF8}tHOP;)mSiFrmCb^#f?7YmVMtD{^Q=TN z`e($WMQcSoP-i8VC&p1(nTA~T&ZZ+!my%7cWJazZg1JF1nNc*pryLY-W(Uud=vCld zs{^%3D`v}@ptd8jc$OG@`I$bD8$hZVWq?YnFo1N@yCWvvYDJplWcw!9JxP!3QzX-~ zb{B0cN)FK}^l?1Wtkc6y)WS?~E>EQW^1G7J`JthfrKly7kD41)X3R(ir8R6xB};~G zHdEVf9>~vH-j#s~!855fsy&S!nAK{K=zW)(ALck0C8m=L(o}jSmj28&bAF!ctJgw1 zFxQgV+)Tz~Q0->O+7p$HU9RpD*G*ehm4Ydo*K_IIJKu64Cx6v$zWwy1w_tI#m9b^_ z7s_TP&0%jSdp|cTDQK~5T~zbsnkRn+P?c|OnIKf6n`}$vdMGU6J3UNH^-sM)#UTUsdPUJ?Y89vFRj`+AcY>Y~jvBa-nD|#|Bb{YFt)(8I99D zQ!v}IoFKYTjdin0&7EG`Tod)uivq1Gi*>FVZp6ao)@x+Wh^=VBCO&u4vtb+cjT6D_ zc%#&c>%gjSNV{pCO+lfuxLT6~Ny}`K2~zDq?ZAAqDQ=~7Z;Z>yaz-&b8MhnrGosC2 zQ#MS_zRbOdkWp{CQ()l`l;zbsBMfp}SU0>^!P(3}y+LCyk-v~|cZm5cu>%Ln`ela_ zNqKu9p;p>S@+L>h49W$T4Edy7rlpew1JvD})NySYH^VaT51OGbL!==*A|>3cf+az% zyRAAb7hr zw}wsk_L`o}oOReHXu4l)A9Afw6{F0TcQVe)#w=qDU3 zuT>e_88!0Ca5)+!uT{xWjWN{NNq(Ckzl}Gh8Y7L}jTuIzyt=cnk1^R8ZR}+ zzSUCFK+!+J=r0jeiK@;jN`HR(iM9UeD|*yVALpl1>Rc)Ddl@~A9!7VgTkfZ;Jom)= z)5Y-n>1=GyPbcHM{d6?^emcmrTli6r5z)7L@Bv8$A9LtiaGg&pO5Rto#X&yM;RC)c zt>iqCW=k$fN^BWG(g^A^&MmXXmhtfaWjyE`WAvivT^9;&I@#=$(2+|`E8i{hx0dq# zS8}tZY_@}LMy*{pXVEPs+E5#Oe?7U`nzuM?%JiXc@$q(4?)DnB!`9@X9=3r4jz(Cu1qSDkm*YlH7^ zH0w%eZDk{CLthEn&^O|@o)>&0Z*qJ3M%v`cXhYpRI`Vf#fhaRR8L-yl1qZwsNE8Bj{uI z6+YZhA97go-1N*=!L;1|6N$@br(JBhD_A|N-l&jIaXP3+)f*kvziYDU!SsU)AHUyd zt+ed%LDQBGF1N0aTPy1Klba4!iu;4uTL~Iq~-3p9;V#k+EAYeE)!F+ ztW@U;UvnzfIj;Ucfd%QcQh{XyMbWY<`A5Vo8zNMD;6r#Q#AvyYR(}S#sOSN^LjB|h z{hjo!dA$pHo2=6HLkpWHQ;WBn{N{Q$ucygnT2Z$a+Eu^z}f zay$g~g>^6hN5XMXKT>H7$9upS><;zwWb}7=Ci1#|biy!>cY!H*1k_KfI-KLF@DO+y zY=(9`4E=<(19*K`I3F&AQ(zw$h6lkVFbMTi5%%X; zZ&UQQ?DaFR>N#)28aM;)3#UWUkE*p0R}+(va=Hq*5}B=_k$zYdfyR3ui|^^NHknXJu} QRUcLN#n{<(b~-rz->A3Ll>h($ literal 0 HcmV?d00001 diff --git a/tests/analyse_dataset.xlsx b/tests/_comparison_results/analyse_feature.xlsx similarity index 100% rename from tests/analyse_dataset.xlsx rename to tests/_comparison_results/analyse_feature.xlsx diff --git a/tests/_comparison_results/merge_cands.xlsx b/tests/_comparison_results/merge_cands.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e25b347bc3e77d740adb46166840951d5c5ac2d7 GIT binary patch literal 5989 zcmZ`-1yq!4*B!cJXz3Q|80p5LQ;=422ubOXbbz5dlu%+2X{0-b?vU;h5RgVe(0{zH zZ{2(Ozi(!}@2q#N+2?)EUi+NqJX>204V?@C0AK^s9h{96`!tkOkxwJYix_!XyI5(v zxwyLXS-86L`ZzgijA`I?@#ABzwLkW%e;q|%K=?>1DyLrJ@RKP%whRAc84Gt5)7aiWH zuiX2@t7z*vmL~80QCx=|v0(@jue#4rvP6qZyyM3v9Zjkxmd=IoD_VDmTsPu>S_ppp@Lz)|{`-X`0Ps^jMomU0 z0i13g)fS$-D{ae9UssdU%FsGvt!|_P9l6SI<*#5`6Ie0U{AG%Z&8&KJm1sJ|02R9N zK|PzUZP%Ur)Rea`h~$dB8l+zEM@GG5}Z!U z%WYA!Hx@2t)GzQlj5<7*c19>&WaIey`>`aS8CANncvz`JEeU5HD?{^Mo-6GSiZ=4a zse=mCFHdq9cd1+vqXR?Nm(KoUO-;p_p=Y5YCeNa;6+yEu$@@ybz;>;|5@$r4M&q@| z`>Rb_|0d?OX%-7HCIFCi9{?al6652@=V1qNg8aD){HEs6*u;H7lq~S5V%*2}(h09u zIHazI)ZN}|vF0`OP?I4HXc+70Yn=!UYF82_9!euPOwwq|ltky=`!Nbu+SwKLJ^RXQ zxf>2kIrTB0=UbdGZ172%JJWOUvb=l3s-+uYitidhz1Ohk|35sqp{U|$cAQ2 z*G#DSSZ2P|EH3Lq?HzNel&sAIo^jjBj_c_dyO=T7Y%>UbCm+&_h2*1<8pyFX!fH;h z9D8mSR-_?gd*XD0A}bv#Ip@Yy_M$IHTIkSkPD?yrk;Umm zhm;`u5|sCc&6#7#H}1PzZ{H`+YRHeews7N|{9rM$h`8@8LFy(>Yb*b9?D8@Z&L%i^ zbcMUGDQU}~pH}&FY-}qh#*L+U`O+}pQcKC1W%ffyHS=tWA}jc9KTp#ZP_$NsK#)`h zJ3Bn%xi&3o0ggagZus|_-csNiX+V7&cK>QD>|2S;8!3ZaLzM;ScVb2AjV$-i?h}qb zu;7Xlmrq)j^3x5;QPQbaLB$WbduDu6J5I{7BnP>me>}A`Km?Elsr|6V$%> zn7)EF?oC-2kV{_L@}4lYTP;OM-HQ)Ub7PX;2_MbL8sLpl>BD4c^ZtX8M9spB`b4R^Ma2GIW={}= zmH);FrxmLI{B3arNV3h{)YxP0oh#RBKo zO;>23mTyCIYRX;`7tP5?Y{Z>wk+6)TF>8QMO8!==*wM6?|&R7a9d)w21y?}esr1Bq81 z5vpKgv)Bheh=6V3=_(IC4z~%+h-5?K>EZNFW>&c$v)6UHcc=O3t?Pgi(YC~n+4)n2 z^1#dG&1>gm31#&x-yR@}2N;zll@^nNQgBQAYf=IxDauI7z@uZ-Q0qe4;EZd$)>Hh( z2jrt=(qchi?pNWO&nDSO$f7FoF_ypL@s5UUP>yLG=W!rzr;&B0Hso+!W^Y z*v`rq*zv*^H}poDsoU7i)ewHp**y1uEk64sRYNJIc$WWqjlMdx7##&Byc zpo|*UONJy&s~D4WkXbyLWm#cWS?_`XzHhMM8Gf#f=xPM`4Ykm8bg5V@u8Z8X?6%C} zCjz@m0uXH?#xsG}b;9{unz)inKaU71ySn4n52M>r{=I<=2YMWNVgLXx$ZeJC&j#Y{ z;^yFP2Z4CF^ZoVn*Dey4F%-8bOcwqPE}mx*;_O5wVsfjhRQO^+OiSA3EU%5KtaY`O zK7PMv!D;IHNuxW4pFX5kLyEWL$BWhy(0 z0)M&n%G5Wr;pJoCQ}j4@+tl@HzZ{7@@%X5Av|M&3b7&qs^g?1(aIqi(I2|CKw5jS%0EMuSg#3l&@EgY@PwBC7&X$p*Fb9Se?z)4sE!d>0pj zw1^!(#3~3SIOwd#D;@Gvt?qn3I`Lj2+ywq(DTtF{wQ)yfz=ii2Tsm8`_%u<^qHbz{ zIcvCUF>`+Nq!hcV>4Mx(sAWuR=Y!Jsbnp37cd2{+-$}p5Q;yA~BWB8vr_<@HODw9F z$fXo#5sE#5{sz1hKZ^U#ZRS8_`))4`MuYl5KWwfpZ!)pMsaUq0mgd!#Rx4i#Aff^? zaK-P*(6)H>GV|S}DyzcQp@s?^;LT9}0c?X%R~A@DBY${5lw@Fhyu!5{T?x?$e{}!m z#}VvvS_agLKgtFMd_(0F8I}r3GR#?H+o+em6!;d%5z;#B9)V$$NHlMGn7hu&Biw3Q zF(-$gAr{wpp}nI4l1*Wx3eaqqqK&Ud&CxTFm5pG2leqaGUh>i z)BLqZ4}rU+xJqRmc*CV^j|K9Mvrq}LGdf_Zd7WT2xtg>!)^V~b)ELJj5TBK^)UA~F zbh+QPOD*458tMAx8V+jHR@d+ATb@{>I`6-9L@KfqM!bz+F&5oGC)}nJWyXz=mwYV$ zC`lWSgBUYH7G0hb==^|*-U)^(3YW*rs{){s1?O9oXrT3BPVmDpQM*Vf@(nT8GYTlx zhvB;X2GfC6zOuL@ZnH{?_yeDdbiMi?AxQ4Cj3y`Y-1C($tQNKr#0}>_eZ9t*Fb~QV z2Rsf=e%C1h*yti5w4+_8=+{DRO3;qhi&C~iiNfH=zfP33A+@yYe~vnjuW|V*ve6@J z#F=v6>2opj;-Tq^5}0sryHa*o{+69OQWW2lKHS-2cWNuu!+h80RQ@4OD!QCak)S$S zREQOY3ch&*fQpr$2{r2-fd(sQ+oR&g_zxeULqG#+xu?J6?Y zuF~c{g&SEEZdB|e%p_Pdh#2$I825>p2@pNxeJPYmJ%}y7aU6(7GPUyS^=P_t6-vfW zOJ_pFl+o#|4Gat-enIhB5^wQi<$JdLW|}%33S~hMD@TgOs2Fh06Ni3ACeECRjOtyS zGAp}1LX)bQi!O)<>aIe0q%5 z9QQaV8{Q|j;(tunS#R!^=cBkO&1+@Nlg`TX`35msuNGZA4}`I!(ZKgNS9qg+XgB!d z6b~k;7+$`QU;xS#P6z0JV)f>lW32#v3AngG{M--Y^Cl?v-3w@R(wyb(DL|m@nx(>B z_`2f5E*XZ-gZ|yBzg@=TyhjB9obUhufzFnV%^>*R7aY0K)8Ha^$vPM`?gWSTB;#`E1qtLSB&FB z1m|Jaz6@l(a=*b6fi>k$pzKg;NtovKmY{XJ^j<=NbzMs{mtYT6Q^Y~4*nd0R+E($f z_?%bc<4Ds(zsEdN?}A&^PqtnUa4rizc+S;jQwWvByq$FLQl!olg@qgJb(MstS6Tqm zsh-!sp~5o`)obi8XA@l`zxM$opeeH_A(qu&U%M}T@i#e4K_`fqa-2|Em!MBsB>W>aSEo#@c)#K?aDI=X=H60__KJXZTTJ?TJ}A3aOJ zi2A18ZT$c*QVeB_KKRRSCz(Zedc6M4arWFZGn!Qo?gJI*RzB5mH->lvh%&}(?&;6W zU+S#-)g;YT)!LuZg}M_q1K z4>*v}M`(4XqRmYor^QZ6Wp{FBZymJ&^Et@jWSMQ%DWolF-3 z4tvniU1=)#`$fiE+p%HR7zx3|tO1D@38ft@0k`qKWxFk?%8CtsG{zI9<&}ijG?z0w-{!{0AmxwAh2Sc*&O!!Ga=7{sKm7jH|LTLO#!a*4XABU=~6bdDU z@~j7iQk7suOm2foRDelJxBkKe(l9yVHrG$s`Z-o_2e1dMq|s8@aMlw@rcB86a&cQC z0}~@S#)guR3+n23C+R|yKufeRXJk#;SqJ-_Ne^irH9B6uYP}-(x1#Hn_MosakTJv&Lw8C3vZwt@s=rv9NDRiJdPQUQk75V@28x;xRT`}{I?u7lAI zsP-!-ho^!F^v;XceG%^1%inD}um7+N@X_IcdCrSGL(Ye4vOKw-tB?Uhm?owM9la0a)stnpdh{dG28L?(6Bp3M)BoSX&Yu)9=VVf;JM zOHY(yex!Ht0GTM!pVIw3Wm&qoIQ%Z&*n|m9q3A#@E$5Bvkwl)trA6$SJMR;&$&s@q10KV}iXt>h8-l87 z662MfPawn2sN$8-I)1f%@ z;_te8`5(ceSF2%jHcICW3a?I$iqD1L2}Tq0YJdg0ox30QbprcX6;G1?g03l`3rckb zj`N7Bve7i*c#_9Xlv}>v8IL{=7SFMwI&S)@7?(v&?V&A$l@R|bp&0jmEL&pXtVr_W zTARpcUOT?8ksh~JjxAqK@%~M&(Eesq8Isf=NP?;Vko!kS|D^aYQPn0&5x(Nb533;A z22d4!%0(^YEhVRfg+;&g|F+A$2gKRKOwZR9;{NoviprA5eygb04F1LQxsa%< z>^^S{*Fgs!wEJoP=eL$F9xm~VA64H~5u~c+_g;Q;zhNt>w?ZElx0TZORUl&slbsGu zHVntkVewSfB>7dC4SQVchP4sc_>!2@2UQE>g8Is@HpKWV4`htaB_F44kkoLv`n^Dv z?UTls32L^;&5gASBGnTxN26PRXYV6ZSNWXCY8cZw*Sg+!zu=UZa7QlER^>kXylT`v zJCvazYTG7!Pbae7cb9Q$sxdePy_msSJW3NiSoGJE#SX9*9g)yVTJWJQOUBVP%B^6X)znOin+=I}6;VrwbnVa9t ze!jn%x%PL8q#`unkY+&-gpmR+q;$;0D}* zPvBel9{zy0)s**Ngk87>i+BJJ;!AiAh1D2U+heBg{&7>6l2fO&4vMI1sR-o!Teo%$ zpQYSp%;NwK<1=_3n<@W3T!@G9bNmi3;ALDu`H$k`cp5*(i})K(q1+|743FS>6xLj& zc~DrLl|s)DoO>#D59{l21TW(q1br842#~=gxEBRJi^uUKeukXcDWCxtV~9ubG~P~u8C-|GxDTJkH}C^&rl7lVHNK7K@K0=@fcdx)U%_KI zp};o@JO@wVNB9|jgR=?PgP-7C1X+v^VHPbsjPKx2cnPmyCxN%%UVLp-Aq}I?)T;!X zhFkF*Ucj3O+=O%S0o;H&Jc(zqiGXWxJ(lnn`~&~Sc?5rOQ~|TDRX{yKyYWRlgsu#D07pn+Y1?3H%!4OX->gs07)KPvL3;4B;_6j;HWb{1#IL zT8q874nY$FJc?(qi=cl8h^TR`X}fm5^3|)tT#&OJ*+q${ecZ0{$YJo9re{r~PmYhnw@wTW?TXTs<>bL3D<39=b)!`m_vcpq z)$Uxbv?Sg3bX^M!%gm~Fc5@s=-fEs~`D>i`ny;1Zw^}reP~IXlG~X{8YEj=Phsu$4 z@wTzQORAdW#*Z=G;m)`b-DB;OF{~%!i+ed2cPCc*m`dYT>2GGUz4BkQj`f9D`O_HR z47FEphUd!PXvSYFpO@rAhm0RrUwN!j3ud4t)6px-)Pg`hFl`mlWHsk#`DF1H&2ftT XzM2h7@@!~_uk9xy?5_-X5N!bHi-l3z0d!-f7pk8nauZn=XcKePMKsfoYe08S8L0hKk4p` zW~|n<$JWFX*4Vm)8MQL$j|M5Gix)$(`~Dx^@&V+pw~PcOInGT8L`Jk z&6rtlCG1SPQ)MJx)3DGz&TP!2``}R>2?y=5S(i+@YjrBt7_MoESk*!0;~2+M(Z)4n zs~t7bR3sTT6A5#Tgso0kHSw68Or&a(%CenfnMrpL^^VELj8V2OGqY7Nb4F%qX1LMH zXm4ydJrnAhZdaRJ(`YHD){CcR&Rp-FZ&^kgP@=224Fo?aR9 zZz~b5$)tCwu1mzD!BkzHm5^w|FZk4L!?KC63I^dycsl$CjI?t9aT0tI{sjIJc5Ln1 z^?`@OQ{idwZultt0`Atv^`8#sz{M~QFNT-HtKqva6Y7_49nNdIeOokLnKYeIu`ZKt zH8Ycx_74pne!;6$w?C|hYv2X&Liic%PxbeP`@w7B8}LK;Is84GK=qG>C%`A+3-IUg zcd!H1t%S4St?)CL2@N&+ugio+=!!mz@Xl}wJOnnw-@`EoeE_yWz*txbH^2r~EHz06lxE8((KZo5ATn>lA{o&!T4n7WFh2;ob z441)1_zU-+5S)Td@FVyMT!7Fs;p-mZy}Y@9C4zQ>-5{IL zvJ)xzO88s&H5^ERhr$Fr5pIGn!uR2qa5Tck!2RGdct2DvUR3xl2%ZJ!!aLz$gpPzw z@OXF!d>?i|SPvM6ufpL7IvfUJBm5z}2bL60yt3{HwcrN01-=dc00$uS5_k=~0nR7z zW$PvE~`9|R7BL70L+fP)cw5L^e>!>jap zgkA)n@Dn}~A+wU3|;~CZh0mG&xYs2%iweHW7r#^x9(1NcjL5CtCzlW;S95B>~xLCBf#0r)dmfv|Jn+weV);GW*uL<^pZpb%_?r@}Mf zU*R4I`ZfF;{174kfSo9KEIb*W3$KBVcQ3tCtU8;!7Wa8FnV&xCiwT@X3~o(E?l*n+F!{cs3E4uNOEhu{}b3GLy{T`hD3 z0?vmo!%hg84_Cm|@KpE=JQzV2z)N5=3?O6%oClADn_x+ydm-#Vs0ZI;9RC@PN6-QA zKKL8h6+u<-boeVc7$KwJB)9~=2n&R+M?eic5w3%`!lMxqg7?A^2$%;~z%}p=*c$ehL2t_eWS1z5|azNC>WlPnHsR5(N!`55Q>@v;-ao8{ld1ZTKsA z2tsa!Tj87VbJz|+dwGO)^LFiOg!G1e;a%``_#5~&?0}F_a4Bq0!NcKN_)FLoA+|?Q zSFfN(3b+S82b(G2Q+PH3Z-%eK-@+sXTn0D8J76<>396jx;uWITA0|+1l=XxQ;Ct{V@Ly2%LD7Jd2sj760sjI!qVzffJ?E!z zIgWpVZ3!?A{v7rt;6y)#6LGi-CUJNT?vA2~@X}HW$LlJ=egxbX{t)g>;Fmqh+Iu@g z2fCL)U2(is2g31oIL)J|XrQAAbOL;VKySiU1nTFfXg>mE;Pr4Af###+G(ScCaC`+0 zBESF?jDipQ1sYC(L*Zd?UjiHlZ-OdOi?)#-vmX-RrzmQ`@uzU6pQ2-MyZ|o8@e%k6 zysuOsU8QGGaw&m+49jrr>j}`#n?E|h!2}phiC@7c9B=Xq@H7D`aNHkK9-Zj_<&`kiG@B_X{ux1pxxQgyYdT zzUmjCoB+qd8{r28YEvpe0K==`D=6rU;|m_gw%&s5xC0>f-u{hoY@AfNkA&#wZ*oTsi@GG$wj*D=dgu^a=CHBB!Cmc?KfAA~uK1w_X z!^dD}zY_1qu@er*z-#>+e~e=nBtMAb8b8MlI9`q8@8Dly$$mTy$2&2+m=d4&D{)sG zPsj0QxXRD57lte0aqxavQfd<=Zon{2N%#7d_!5qL;&>O_+t0BY!y0%pjdPrz<4{U8 zar_OARep|P9AAW2;F$4qJRXOUI2;f6@pCkByaYZA|A6$8-Ej(B{<0PN{k!e9)43h(rD z*bT!qw9*-V4x?~*6o= z$eVC{*3ZGlVKNR!__fl>IQ$*MA$}#Cg~NIr4)trLvvHV0fGDc*U7E-CGh9TGVZWl!B;vk)hHEjr-mmCJ3}^cpMlqb`XSfW*H~b8D#_+Fx zhFi&^EBy>l!SHHWvS+$rFa!w&hC5;Ky2qfEH(|BGSAGU@kLaQi&i6Ce*HfFK5jbC_ zYLPWaR~YeFG7*nNtU#@m2rLNKgpzhmC}GALtXL+!Yx8A6E76!}-ddN6)jK9tatC+J z@sRTa3(wDzd$Jo=TCrp}A~%l@$d6xD6N-lAF7a?`f=s&KQY#z_*K1`9Qgz9?a76Ax&!ht_TqSfW*<=Q5t(pe0 zS9Pl>)ZTnS#7;WVMS0=rd24DDW-1bH$fSoBo|Ph2g=0#MUE91l+NkaePgfjl*-1zI zQY&JH?LaD48xYIpt;r^-;gG^GOJY_kY9_%|GEBEG2j&M|Cf_Wm5v}3oO-aj1vO)QI z;53e#fx7A~(tK%H@=J0{GFhZZH)Kn3uoPg$0#Z^?Dkdf+#)4B)LM5|QZ)9Qf zR>^R(5+N&`v{%Lx^0k3bl3+vf>m-$3r7E;Cuf$xVRGN~ujbkcJ>2Ya5DJ@V~+6)=V zVJm1iSYjn6164~UZiHn`8s(c8lAoiNB;vIx=e(^# zH3n)^iJFikbe*)5G|@uKwpYn_Mx4|sH>8qrTMbuJol2S!=|Slt5i>3uy4IZ_5|NY| z8BRDE6sm13Pm7Cu$$~nG;EW#03i-B2JSq(&8KVZ7gj{G!KZRq`lQP-J$Q2y~tha)R zxavbIu-pvA63v^P31f6%&)my58R4rXEBy_KoCJqH;a+E3kP)jVNubpq=c*dSPH^@wza>ipOJ#m2*wpD)G zx^TC1RFkVLQr&GN%sT0acwL>IXe+Y~<7QRBOf_$kVk<2t8EGaJLmExBe9k^sbPrds zT7j0Q>gz10vnCYOdzExnEZ}@SNGc*DLGnedGoDI}EKvy(Hn6;TYpk}xN;IjQoW@$^ zHnoh9Dre4M88}c=k=g`>1oI^I#1OnTU{sZW!7?J(9A7{_?+gg3@%Amr|z@!Qru}oGw2LeGgVhFvwN+Rdv>7S zYK)l4CY3qaf<!F3gJs zvSD#jxp_-nojb?(pXJV3?!>7Y-qY5jy_IO02s(FWF@H;|y0O$Om*2-(i(xV$6rU;P zrlQfXGu=7OKjjd$x|k`;ok~?&7tKF4Aw}b|Y9)*JLT<9O-IjBQqb=hxuS?vo+)0PH zRghj7G1nccDw)h$Ipg+!ZVI*CInP<(S zyr8zVval2^dsWUgS7AaUOVu30v zu`)h}%E~n4s&_UWfvTizawRje{SeF!a>@jdyFSOYtFrbMp-?^+$GOjuD{) zk;Svf*vrlIf$RWM%_sv@T7?0mlinRMu_h~AFDF|zx$a4ND254Wby;wk;)EUnoKit^mrT@Y}zE}%t~v->sK1j&|GHrg^5XVQh66t^pKn-u9r zec9+U4x@QVzTb30>7pk#t z)~mVGYnyGNiCNLEe}3cY6)|hX#xrlro;~T=o(=!TVLv-oD|O(iulyU5Y^rAqkgqJJ zR^ULwGV5hJRQpXku+XfJSxMdYW3sZGG0VQi?X=wdXS1`E?UAz!b1xZWjGOL+mp|ZS zG4+oBevS*Pg!h^^lkTgxW9%GqmksWIFqac{;9yy;?7DbD-VBJVb#+g9OCxCp<#I{} zctS49Qi;6b=%;;}Po@s!MqvUJ6I%1D!i$jZFUAhCf(aTt(F3(j_zC`GgMW)F1gA~$c|g>mjYaT zWY;T6=3EQ3m}s+$Yj!1;+cEUyQQUc&zJTDh^SF;HF^B1gmJwQ@Nc zXbdt&$-lT-=o1tA!U4^q3@k_K9?wZL&OdT`RIm^@^-Y6^DLPixg;sEV*p7bs82I@ z%o;n!!~d7@pl^ZEi=ua3$h*B{yHi3(E;g-vx5(dG%J*N%?Uu6L4!Rk&cHNu>H;-sT zZSei|KyS#{4Wo+?e6;j1?~R_hTJwcqc-%_ ztPTGcl1f+h+fSO?+!}gRdyC8))!IO}Yu^2H-eIo|zQ56|E1|WOjI0fPC1^w6h~Ii% z@Qu99?dcn7n=7LYb@S-R-#xu_q>??e&9&WTZQ8J8k-E0N+O*;KAMyJu`djCru9TMY zU+imd&*V)dEgRfdn{Orl^Ub3rcq^5+M`>9pUqfAMUqfBAb|~3}+ih5~XgyrL26x(Ad&UUT>wv=sl{i}F@KPs=tJ zr46+fZ^XQo*R}H+aYH-=pK75TX|m9XSG+( z-u-kzeSV+i%YO)dt+m>+#|BMXK7ri1I%ch^o-cO~tQPkb zo!XU^rQ7L4=Q;Yd@@)M~s7t83dR%>|XZB(BaxlPo_T9$X=&V%c4t%YV(OSlgI-s{f+Y=~96eT}07} zZt_dStmq+B8`>p2nqh=o6{|meT$FW&?V)~tg8pXs&b;0o>SxnPSm%!(1bR^H!s|M< zCvv>ys>3oe9<;bhnYhTtJ^84N=GY=Q$h)|(XlU3dNDscO#Ka3Guk=fdewf3bEr z$0l3@_4hc}a(oIr8P-7k&E8s$EqDx^4UdMEP|xz6IPMQC;O=k&oC^1bO>ich1?NEn zE`W=m{>rR=qSEQSeg<3*PlYGJ6XEf216&PP!Idxt6R-~IZ{W`7Sbr~0KM3FiUOx@4 zgX5rvTH7}0ug9$z)UK?|orNql70G`oBL6Rk{BI*_M%4fFNOe@cL*h)4N6FMDUw)DM rpw?#6?W^nLwp!<(edYTP@l|$pZK_cwX)|e6N7Riic4obu3Xc6Z3Ol{~ literal 0 HcmV?d00001 diff --git a/tests/_comparison_results/tk_graph_built.pkl b/tests/_comparison_results/tk_graph_built.pkl new file mode 100644 index 0000000000000000000000000000000000000000..15b3a5d8a8ffda90b814dee8819c0109b1a5edd1 GIT binary patch literal 1766 zcmb7_TZ5&$CkSI0&Hj-1 zXPzRs&V>}oygaIEUA6mWU!-O!Y#S?CbBB#$9*viSo3kHMZJ3M~8Lu40irG|*E0=X^ zA&YO}unz(K+Qf5OFAgZjbAVHLAbm6}%|M4jPLwV3;ExSP4iN{*!!>wY(O>>_fDxm(Z=3oIs*Su2E0| z-@4{oNXT+vA0vx56Jpn-S0EOV$-&x@tJ9m{c2-J9nc`*Pt;_K(7I7IJEhojC%pe+= z2CJ_Z4{na-Tf6@4U5@R;j}XHJuy7H-$6(z?MesE{LKUYLmthG)YVAkEE%y&3@7>jLcRk$QC9K(lKmD!-{{nA3qwxR$ literal 0 HcmV?d00001 diff --git a/tests/analysis/test_graphs.py b/tests/analysis/test_graphs.py index b962f30..2d59df3 100644 --- a/tests/analysis/test_graphs.py +++ b/tests/analysis/test_graphs.py @@ -2,6 +2,7 @@ import networkx as nx import pytest from lang_main.analysis import graphs +from lang_main.errors import EmptyEdgesError, EmptyGraphError, EdgePropertyNotContainedError TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH' @@ -40,13 +41,18 @@ def build_init_graph(token_graph: bool): @pytest.fixture(scope='module') -def graph(): +def graph() -> graphs.DiGraph: return build_init_graph(token_graph=False) @pytest.fixture(scope='module') -def tk_graph(): - return build_init_graph(token_graph=True) +def tk_graph() -> graphs.TokenGraph: + return build_init_graph(token_graph=True) # type: ignore + + +@pytest.fixture(scope='module') +def tk_graph_undirected(tk_graph) -> graphs.Graph: + return tk_graph.undirected def test_graph_size(graph): @@ -61,7 +67,45 @@ def test_save_to_GraphML(graph, tmp_path): assert saved_file.exists() -def test_metadata_retrieval(graph): +def test_save_load_pickle_tk_graph(tk_graph, tmp_path): + filename = 'test_save_tkg' + tk_graph.to_pickle(tmp_path, filename) + load_pth = (tmp_path / filename).with_suffix('.pkl') + assert load_pth.exists() + loaded_graph = graphs.TokenGraph.from_file(load_pth) + assert loaded_graph.nodes == tk_graph.nodes + assert loaded_graph.edges == tk_graph.edges + filename = None + tk_graph.to_pickle(tmp_path, filename) + load_pth = (tmp_path / tk_graph.name).with_suffix('.pkl') + assert load_pth.exists() + loaded_graph = graphs.TokenGraph.from_file(load_pth) + assert loaded_graph.nodes == tk_graph.nodes + assert loaded_graph.edges == tk_graph.edges + + +@pytest.mark.parametrize( + 'import_graph,directed', [('tk_graph', True), ('tk_graph_undirected', False)] +) +def test_save_load_GraphML_tk_graph(import_graph, tk_graph, directed, tmp_path, request): + test_graph = request.getfixturevalue(import_graph) + filename = 'test_save_tkg' + tk_graph.to_GraphML(tmp_path, filename, directed=directed) + load_pth = (tmp_path / filename).with_suffix('.graphml') + assert load_pth.exists() + loaded_graph = graphs.TokenGraph.from_file(load_pth, node_type_graphml=int) + assert loaded_graph.nodes == test_graph.nodes + assert loaded_graph.edges == test_graph.edges + filename = None + tk_graph.to_GraphML(tmp_path, filename, directed=directed) + load_pth = (tmp_path / tk_graph.name).with_suffix('.graphml') + assert load_pth.exists() + loaded_graph = graphs.TokenGraph.from_file(load_pth, node_type_graphml=int) + assert loaded_graph.nodes == test_graph.nodes + assert loaded_graph.edges == test_graph.edges + + +def test_get_graph_metadata(graph): metadata = graphs.get_graph_metadata(graph) assert metadata['num_nodes'] == 4 assert metadata['num_edges'] == 6 @@ -72,7 +116,7 @@ def test_metadata_retrieval(graph): assert metadata['total_memory'] == 448 -def test_graph_update_batch(): +def test_update_graph_batch(): graph_obj = build_init_graph(token_graph=False) graphs.update_graph(graph_obj, batch=((4, 5), (5, 6)), weight_connection=8) metadata = graphs.get_graph_metadata(graph_obj) @@ -82,7 +126,7 @@ def test_graph_update_batch(): assert metadata['max_edge_weight'] == 8 -def test_graph_update_single_new(): +def test_update_graph_single_new(): graph_obj = build_init_graph(token_graph=False) graphs.update_graph(graph_obj, parent=4, child=5, weight_connection=7) metadata = graphs.get_graph_metadata(graph_obj) @@ -92,7 +136,7 @@ def test_graph_update_single_new(): assert metadata['max_edge_weight'] == 7 -def test_graph_update_single_existing(): +def test_update_graph_single_existing(): graph_obj = build_init_graph(token_graph=False) graphs.update_graph(graph_obj, parent=1, child=4, weight_connection=5) metadata = graphs.get_graph_metadata(graph_obj) @@ -103,13 +147,13 @@ def test_graph_update_single_existing(): @pytest.mark.parametrize('cast_int', [True, False]) -def test_graph_undirected_conversion(graph, cast_int): +def test_convert_graph_to_undirected(graph, cast_int): graph_undir = graphs.convert_graph_to_undirected(graph, cast_int=cast_int) # edges: (1, 2, w=1) und (2, 1, w=6) --> undirected: (1, 2, w=7) assert graph_undir[1][2]['weight'] == pytest.approx(7.0) -def test_graph_cytoscape_conversion(graph): +def test_convert_graph_to_cytoscape(graph): cyto_graph, weight_data = graphs.convert_graph_to_cytoscape(graph) node = cyto_graph[0] edge = cyto_graph[-1] @@ -144,7 +188,17 @@ def test_tk_graph_properties(tk_graph): assert metadata_undirected['total_memory'] == 392 -def test_graph_degree_filter(tk_graph): +def test_filter_graph_by_edge_weight(tk_graph): + filtered_graph = graphs.filter_graph_by_edge_weight( + tk_graph, + bound_lower=2, + bound_upper=5, + ) + assert not filtered_graph.has_edge(1, 2) + assert not filtered_graph.has_edge(2, 1) + + +def test_filter_graph_by_node_degree(tk_graph): filtered_graph = graphs.filter_graph_by_node_degree( tk_graph, bound_lower=3, @@ -153,7 +207,7 @@ def test_graph_degree_filter(tk_graph): assert len(filtered_graph.nodes) == 2 -def test_graph_edge_number_filter(tk_graph): +def test_filter_graph_by_number_edges(tk_graph): number_edges_limit = 1 filtered_graph = graphs.filter_graph_by_number_edges( tk_graph, @@ -166,3 +220,75 @@ def test_graph_edge_number_filter(tk_graph): bound_upper=None, ) assert len(filtered_graph.nodes) == 2, 'one edge should result in only two nodes' + + +def test_add_weighted_degree(): + graph_obj = build_init_graph(token_graph=False) + property_name = 'degree_weighted' + graphs.add_weighted_degree(graph_obj, 'weight', property_name) + assert graph_obj.nodes[1][property_name] == 14 + assert graph_obj.nodes[2][property_name] == 10 + assert graph_obj.nodes[3][property_name] == 6 + + +def test_static_graph_analysis(): + graph_obj = build_init_graph(token_graph=True) + (graph_obj,) = graphs.static_graph_analysis(graph_obj) # type: ignore + property_name = 'degree_weighted' + assert graph_obj.nodes[1][property_name] == 14 + assert graph_obj.nodes[2][property_name] == 10 + assert graph_obj.nodes[3][property_name] == 6 + assert graph_obj.undirected.nodes[1][property_name] == 14 + assert graph_obj.undirected.nodes[2][property_name] == 10 + assert graph_obj.undirected.nodes[3][property_name] == 6 + + +def test_pipe_add_graph_metrics(): + graph_obj = build_init_graph(token_graph=False) + graph_obj_undir = graphs.convert_graph_to_undirected(graph_obj, cast_int=True) + graph_collection = graphs.pipe_add_graph_metrics(graph_obj, graph_obj_undir) + property_name = 'degree_weighted' + assert graph_collection[0].nodes[1][property_name] == 14 + assert graph_collection[0].nodes[2][property_name] == 10 + assert graph_collection[0].nodes[3][property_name] == 6 + assert graph_collection[1].nodes[1][property_name] == 14 + assert graph_collection[1].nodes[2][property_name] == 10 + assert graph_collection[1].nodes[3][property_name] == 6 + + +def test_pipe_rescale_graph_edge_weights(tk_graph): + rescaled_tkg, rescaled_undir = graphs.pipe_rescale_graph_edge_weights(tk_graph) + assert rescaled_tkg[2][1]['weight'] == pytest.approx(1.0) + assert rescaled_tkg[1][2]['weight'] == pytest.approx(0.0952) + assert rescaled_undir[2][1]['weight'] == pytest.approx(1.0) + assert rescaled_undir[1][2]['weight'] == pytest.approx(1.0) + + +@pytest.mark.parametrize('import_graph', ['graph', 'tk_graph']) +def test_rescale_edge_weights(import_graph, request): + test_graph = request.getfixturevalue(import_graph) + rescaled_graph = graphs.rescale_edge_weights(test_graph) + assert rescaled_graph[2][1]['weight'] == pytest.approx(1.0) + assert rescaled_graph[1][2]['weight'] == pytest.approx(0.0952) + + +@pytest.mark.parametrize('import_graph', ['graph', 'tk_graph']) +def test_verify_property(import_graph, request): + test_graph = request.getfixturevalue(import_graph) + test_property = 'centrality' + with pytest.raises(EdgePropertyNotContainedError): + graphs.verify_property(test_graph, property=test_property) + test_property = 'weight' + assert not graphs.verify_property(test_graph, property=test_property) + + +def test_verify_non_empty_graph(): + graph = nx.Graph() + with pytest.raises(EmptyGraphError): + graphs.verify_non_empty_graph(graph) + graph.add_nodes_from([1, 2, 3, 4]) + with pytest.raises(EmptyEdgesError): + graphs.verify_non_empty_graph(graph, including_edges=True) + assert not graphs.verify_non_empty_graph(graph, including_edges=False) + graph.add_edges_from([(1, 2), (1, 3), (2, 4)]) + assert not graphs.verify_non_empty_graph(graph, including_edges=True) diff --git a/tests/analysis/test_preprocessing.py b/tests/analysis/test_preprocessing.py index eb6caf9..bc87f15 100644 --- a/tests/analysis/test_preprocessing.py +++ b/tests/analysis/test_preprocessing.py @@ -2,8 +2,11 @@ executed in in a pipeline """ +from pathlib import Path +from lang_main import model_loader from lang_main.analysis import preprocessing as ppc from lang_main.analysis import shared +from lang_main.types import LanguageModels, STFRModelTypes def test_load_data(raw_data_path, raw_data_date_cols): @@ -71,3 +74,43 @@ def test_analyse_feature(raw_data_path, raw_data_date_cols): (data,) = ppc.analyse_feature(data, target_feature=target_features[0]) assert len(data) == 139 + + +def test_numeric_pre_filter_feature(data_analyse_feature, data_numeric_pre_filter_feature): + # Dataset contains 139 entries. The feature "len" has a minimum value of 15, + # which occurs only once. If all values >= are retained only one entry should be + # filtered. This results in a total number of 138 entries. + (data,) = ppc.numeric_pre_filter_feature( + data=data_analyse_feature, + feature='len', + bound_lower=16, + bound_upper=None, + ) + assert len(data) == 138 + eval_merged = data[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']] + eval_benchmark = data_numeric_pre_filter_feature[ + ['entry', 'len', 'num_occur', 'num_assoc_obj_ids'] + ] + assert bool((eval_merged == eval_benchmark).all(axis=None)) + + +def test_merge_similarity_duplicates(data_analyse_feature, data_merge_similarity_duplicates): + cos_sim_threshold = 0.8 + # reduce dataset to 10 entries + data = data_analyse_feature.iloc[:10] + model = model_loader.load_sentence_transformer( + model_name=STFRModelTypes.ALL_MPNET_BASE_V2, + ) + (merged_data,) = ppc.merge_similarity_duplicates( + data=data, + model=model, + cos_sim_threshold=cos_sim_threshold, + ) + # constructed use case: with this threshold, + # 2 out of 10 entries are merged into one + assert len(merged_data) == 9 + eval_merged = merged_data[['entry', 'len', 'num_occur', 'num_assoc_obj_ids']] + eval_benchmark = data_merge_similarity_duplicates[ + ['entry', 'len', 'num_occur', 'num_assoc_obj_ids'] + ] + assert bool((eval_merged == eval_benchmark).all(axis=None)) diff --git a/tests/analysis/test_tokens.py b/tests/analysis/test_tokens.py new file mode 100644 index 0000000..dc16ef2 --- /dev/null +++ b/tests/analysis/test_tokens.py @@ -0,0 +1,79 @@ +from pathlib import Path + +import pytest + +from lang_main import model_loader +from lang_main.analysis import graphs, tokens +from lang_main.types import SpacyModelTypes + +SENTENCE = ( + 'Ich ging am 22.05. mit ID 0912393 schnell über die Wiese zu einem Menschen, ' + 'um ihm zu helfen. Ich konnte nicht mit ansehen, wie er Probleme beim Tragen ' + 'seiner Tasche hatte.' +) + + +@pytest.fixture(scope='module') +def spacy_model(): + model = model_loader.load_spacy( + model_name=SpacyModelTypes.DE_CORE_NEWS_SM, + ) + return model + + +def test_pre_clean_word(): + string = 'Öl3bad2024prüfung' + assert tokens.pre_clean_word(string) == 'Ölbadprüfung' + + +def test_is_str_date(): + string = '22.05.' + assert tokens.is_str_date(string, fuzzy=True) + string = '22.05.2024' + assert tokens.is_str_date(string) + string = '22-05-2024' + assert tokens.is_str_date(string) + string = '9009090909' + assert not tokens.is_str_date(string) + string = 'hello347' + assert not tokens.is_str_date(string) + + +# TODO: depends on fixed Constants +def test_obtain_relevant_descendants(spacy_model): + doc = spacy_model(SENTENCE) + sent1 = tuple(doc.sents)[0] # first sentence + word1 = sent1[1] # word "ging" (POS:VERB) + descendants1 = ('0912393', 'schnell', 'Wiese', 'Menschen') + rel_descs = tokens.obtain_relevant_descendants(word1) + rel_descs = tuple((token.text for token in rel_descs)) + assert descendants1 == rel_descs + + sent2 = tuple(doc.sents)[1] # first sentence + word2 = sent2[1] # word "konnte" (POS:AUX) + descendants2 = ('mit', 'Probleme', 'Tragen', 'Tasche') + rel_descs = tokens.obtain_relevant_descendants(word2) + rel_descs = tuple((token.text for token in rel_descs)) + assert descendants2 == rel_descs + + +def test_add_doc_info_to_graph(spacy_model): + doc = spacy_model(SENTENCE) + tk_graph = graphs.TokenGraph() + tokens.add_doc_info_to_graph(tk_graph, doc, weight=2) + assert len(tk_graph.nodes) == 11 + assert len(tk_graph.edges) == 17 + assert '0912393' in tk_graph.nodes + + +def test_build_token_graph( + data_merge_similarity_duplicates, + spacy_model, + data_tk_graph_built, +): + tk_graph, _ = tokens.build_token_graph( + data=data_merge_similarity_duplicates, + model=spacy_model, + ) + assert len(tk_graph.nodes) == len(data_tk_graph_built.nodes) + assert len(tk_graph.edges) == len(data_tk_graph_built.edges) diff --git a/tests/conftest.py b/tests/conftest.py index 244efcf..c2f44e6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,7 @@ from pathlib import Path +from lang_main.analysis import graphs +import pandas as pd import pytest DATE_COLS: tuple[str, ...] = ( @@ -12,7 +14,7 @@ DATE_COLS: tuple[str, ...] = ( @pytest.fixture(scope='session') def raw_data_path(): - pth_data = Path('./tests/Dummy_Dataset_N_1000.csv') + pth_data = Path('./tests/_comparison_results/Dummy_Dataset_N_1000.csv') assert pth_data.exists() return pth_data @@ -21,3 +23,27 @@ def raw_data_path(): @pytest.fixture(scope='session') def raw_data_date_cols(): return DATE_COLS + + +@pytest.fixture(scope='session') +def data_analyse_feature() -> pd.DataFrame: + pth_data = Path('./tests/_comparison_results/analyse_feature.pkl') + return pd.read_pickle(pth_data) + + +@pytest.fixture(scope='session') +def data_numeric_pre_filter_feature() -> pd.DataFrame: + pth_data = Path('./tests/_comparison_results/numeric_pre_filter.pkl') + return pd.read_pickle(pth_data) + + +@pytest.fixture(scope='session') +def data_merge_similarity_duplicates() -> pd.DataFrame: + pth_data = Path('./tests/_comparison_results/merge_similarity_candidates.pkl') + return pd.read_pickle(pth_data) + + +@pytest.fixture(scope='session') +def data_tk_graph_built(): + pth_data = Path('./tests/_comparison_results/tk_graph_built.pkl') + return graphs.TokenGraph.from_file(pth_data)