started adding comprehensive unit tests

Florian Förster 2024-11-13 17:54:47 +01:00
parent a0ca71ea87
commit 6781b4a132
32 changed files with 4042 additions and 1430 deletions

View File

@ -2,28 +2,27 @@
[paths] [paths]
inputs = './inputs/' inputs = './inputs/'
results = './results/test_new2/' # results = './results/dummy_N_1000/'
dataset = './01_2_Rohdaten_neu/Export4.csv' # dataset = '../data/Dummy_Dataset_N_1000.csv'
#results = './results/Export7/' results = './results/test_20240807/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv' dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control] [control]
preprocessing = true preprocessing_skip = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false graph_postprocessing_skip = false
time_analysis = false graph_rescaling_skip = false
time_analysis_skip = false graph_static_rendering_skip = false
time_analysis_skip = true
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess] [preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [ date_cols = [
"VorgangsDatum", "VorgangsDatum",
"ErledigungsDatum", "ErledigungsDatum",
@ -34,17 +33,25 @@ threshold_amount_characters = 5
threshold_similarity = 0.8 threshold_similarity = 0.8
[graph_postprocessing] [graph_postprocessing]
threshold_edge_weight = 150 threshold_edge_number = 330
# threshold_edge_weight = 150
[time_analysis.uniqueness] [time_analysis.uniqueness]
threshold_unique_texts = 4 threshold_unique_texts = 4
criterion_feature = 'HObjektText' criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID' feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input] [time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [ input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung', 'VorgangsBeschreibung',
] ]
activity_feature = 'VorgangsTypName' activity_feature = 'VorgangsTypName'
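For orientation, the [control] table above now contains only *_skip flags, each of which disables one pipeline stage for debugging. A minimal sketch of reading them with the standard library, assuming a local copy of this file named lang_main_config.toml (the package's own loader appears further down in this commit):

import tomllib  # Python >= 3.11, matching requires-python in pyproject.toml

with open('lang_main_config.toml', 'rb') as f:  # hypothetical local copy of the config above
    cfg = tomllib.load(f)

# every *_skip flag set to true disables the corresponding pipeline stage
for key, value in cfg['control'].items():
    if key.endswith('_skip') and value:
        print('stage disabled:', key.removesuffix('_skip'))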

View File

@ -1,58 +0,0 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/'
dataset = '../data/02_202307/Export4.csv'
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_number = 300
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

View File

@ -19,6 +19,900 @@
"outputs": [], "outputs": [],
"source": [] "source": []
}, },
{
"cell_type": "code",
"execution_count": 1,
"id": "c0dab307-2c2c-41d2-9867-ec9ba82a8099",
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "629f2051-7ef0-4ce0-a5ad-86b292cc20af",
"metadata": {},
"outputs": [],
"source": [
"edge_weighst = [\n",
" {'weight': 1},\n",
" {'weight': 2},\n",
" {'weight': 3},\n",
" {'weight': 4},\n",
" {'weight': 5},\n",
" {'weight': 6},\n",
"]\n",
"edges = [\n",
" (1, 2),\n",
" (1, 3),\n",
" (2, 4),\n",
" (3, 4),\n",
" (1, 4),\n",
" (2, 1),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c4fd9997-1e41-49f1-b879-4b3a6571931d",
"metadata": {},
"outputs": [],
"source": [
"edges_to_add = []\n",
"for i, edge in enumerate(edges):\n",
" edge = list(edge)\n",
" edge.append(edge_weighst[i])\n",
" edges_to_add.append(tuple(edge))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "bdf1c8d2-1093-420e-91fa-e2edd0cd72f1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 2, {'weight': 1}),\n",
" (1, 3, {'weight': 2}),\n",
" (2, 4, {'weight': 3}),\n",
" (3, 4, {'weight': 4}),\n",
" (1, 4, {'weight': 5}),\n",
" (2, 1, {'weight': 6})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"edges_to_add"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d017b2bc-9cd3-4124-afed-c6eabc07a540",
"metadata": {},
"outputs": [],
"source": [
"G = nx.DiGraph()\n",
"G.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "91d4094b-f886-4056-a697-5223f157f1d3",
"metadata": {},
"outputs": [],
"source": [
"tk = graphs.TokenGraph()\n",
"tk.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "518cada9-561a-4b96-b750-3d500d1d28b9",
"metadata": {},
"outputs": [],
"source": [
"from lang_main.analysis import graphs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3235f188-6e99-4855-aa3d-b0e04e3db319",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'num_nodes': 4,\n",
" 'num_edges': 6,\n",
" 'min_edge_weight': 1,\n",
" 'max_edge_weight': 6,\n",
" 'node_memory': 112,\n",
" 'edge_memory': 336,\n",
" 'total_memory': 448}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs.get_graph_metadata(G)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca2ce8e8-d72a-4edf-ae42-0f79bd9d19a2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 38,
"id": "223dc592-fa56-4536-a5c2-a166001a6aca",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>6.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4\n",
"1 0.0 1.0 2.0 5.0\n",
"2 6.0 0.0 0.0 3.0\n",
"3 0.0 0.0 0.0 4.0\n",
"4 0.0 0.0 0.0 0.0"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nx.to_pandas_adjacency(G)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "1f677da0-7416-413c-adb1-ae1384e09349",
"metadata": {},
"outputs": [],
"source": [
"G_undir = graphs.convert_graph_to_undirected(G, cast_int=False)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "356862fb-2383-43d9-80ba-4fe83646c9d9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>7.0</td>\n",
" <td>2.0</td>\n",
" <td>5.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5.0</td>\n",
" <td>3.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 1 2 3 4\n",
"1 0.0 7.0 2.0 5.0\n",
"2 7.0 0.0 0.0 3.0\n",
"3 2.0 0.0 0.0 4.0\n",
"4 5.0 3.0 4.0 0.0"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nx.to_pandas_adjacency(G_undir)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "b8a3db1a-0d2a-4635-ab88-7802e2cf59e4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_undir.is_directed()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "46001528-75b0-4fe8-a3ec-353bbd3eeeff",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'weight': 7.0}"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G_undir[1][2]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "cf2dcdff-f0b7-416e-9db3-c7a21ea96b96",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"([{'data': {'id': 1, 'label': 1}},\n",
" {'data': {'id': 2, 'label': 2}},\n",
" {'data': {'id': 3, 'label': 3}},\n",
" {'data': {'id': 4, 'label': 4}},\n",
" {'data': {'source': 1, 'target': 2, 'weight': 1}},\n",
" {'data': {'source': 1, 'target': 3, 'weight': 2}},\n",
" {'data': {'source': 1, 'target': 4, 'weight': 5}},\n",
" {'data': {'source': 2, 'target': 4, 'weight': 3}},\n",
" {'data': {'source': 2, 'target': 1, 'weight': 6}},\n",
" {'data': {'source': 3, 'target': 4, 'weight': 4}}],\n",
" {'min': 1, 'max': 6})"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"graphs.convert_graph_to_cytoscape(G)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f82481e9-873f-4657-80d3-ba75af74fa27",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 4, number of edges: 6)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tk.update_metadata()\n",
"tk"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "4b806620-b469-45ef-823b-db46f8590509",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(G.degree)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "2a41d019-1b6b-46f7-b13e-ac22da737940",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G.degree[1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e99f2fb4-4c8d-4564-810d-a4b2ed9d6009",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[(1, 4), (2, 3), (3, 2), (4, 3)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(tk.degree)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1368ebf6-e008-492d-8d15-fe3ed12b78a3",
"metadata": {},
"outputs": [],
"source": [
"g_filt = graphs.filter_graph_by_node_degree(\n",
" tk,\n",
" bound_lower=3,\n",
" bound_upper=3,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "de69f73e-da1d-4479-81da-006f2ce61844",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TokenGraph(name: TokenGraph, number of nodes: 2, number of edges: 1)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"g_filt"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "b12fd64d-737e-4c68-94ea-72a817647a04",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2, 4]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(g_filt.nodes)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "21434c7c-887c-4f9f-884a-48514e2279e0",
"metadata": {},
"outputs": [],
"source": [
"G = nx.DiGraph()\n",
"G.add_edges_from(edges_to_add)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "2defef69-f09a-4869-984a-27b6373b17b9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'weight': 1}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"G[1][2]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "0308a2ac-f554-4e24-9ddb-578dd588f3c8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(G.edges)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15d9ce65-f9a5-40de-a737-098579f6a8ee",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "7acf4be7-45f3-45e6-87f5-14343f23d610",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9139812b-74ba-45ce-adfc-e57667259692",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loaded TOML config file successfully.\n"
]
}
],
"source": [
"from lang_main import search_iterative, search_base_path\n",
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "35118922-3a17-4698-93bc-5292a276a4b4",
"metadata": {},
"outputs": [],
"source": [
"from lang_main import constants"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "341299bf-e926-4e55-8545-8805a186f49c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-models')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"constants.MODEL_BASE_FOLDER"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "11ce4062-b229-4d88-967d-6eeb6d0135b7",
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "40dac543-1e53-4fd8-a192-88f3527872b2",
"metadata": {},
"outputs": [],
"source": [
"model_kwargs = {\n",
" 'file_name': 'onnx/model_quint8_avx2.onnx',\n",
" 'provider': 'CPUExecutionProvider',\n",
" 'export': False,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "8a0eaa9f-e2d2-4106-b80b-80916e9d8bfe",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "215b46e3607e4530b2d8f8227367ef23",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"model_quint8_avx2.onnx: 0%| | 0.00/23.0M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"A:\\Arbeitsaufgaben\\lang-main\\.venv\\Lib\\site-packages\\huggingface_hub\\file_download.py:139: UserWarning: `huggingface_hub` cache-system uses symlinks by default to efficiently store duplicated files but your machine does not support them in A:\\Arbeitsaufgaben\\lang-models\\models--sentence-transformers--all-MiniLM-L6-v2. Caching files will still work but in a degraded version that might require more space on your disk. This warning can be disabled by setting the `HF_HUB_DISABLE_SYMLINKS_WARNING` environment variable. For more details, see https://huggingface.co/docs/huggingface_hub/how-to-cache#limitations.\n",
"To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development\n",
" warnings.warn(message)\n"
]
}
],
"source": [
"stfr = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name='cosine', backend='onnx', model_kwargs=model_kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "cd921aca-0673-41ec-98a3-18e360a39a41",
"metadata": {},
"outputs": [],
"source": [
"from lang_main.constants import SPACY_MODEL_NAME"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "88173a68-7d8e-4f4c-a4ad-bbf78efaf781",
"metadata": {},
"outputs": [],
"source": [
"import importlib"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e5293976-22ab-406a-ba32-066fd7254394",
"metadata": {},
"outputs": [],
"source": [
"mod = importlib.import_module(SPACY_MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "6023a339-02da-429c-acf5-f14a56989357",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<module 'de_dep_news_trf' from 'A:\\\\Arbeitsaufgaben\\\\lang-main\\\\.venv\\\\Lib\\\\site-packages\\\\de_dep_news_trf\\\\__init__.py'>"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mod"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "5f4fa066-fa0f-4818-9cf9-ec28923150ba",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loaded TOML config file successfully.\n"
]
}
],
"source": [
"from lang_main.analysis.shared import clean_string_slim"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "71286836-7eb2-4095-ab82-42d7ac7ed476",
"metadata": {},
"outputs": [],
"source": [
"string = 'Ölleckage durch\\nundichten \\t Ölsumpf,, aber Dichtung intakt??!!!'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "e8284e76-e750-458e-bb63-d59d6d57a396",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ölleckage durch\n",
"undichten \t Ölsumpf,, aber Dichtung intakt??!!!\n"
]
}
],
"source": [
"print(string)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "82e98d8f-2e24-42f9-a3ed-3b3454ae64f4",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clean_string_slim(string)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b527b145-15d2-4961-b441-1843fe9f5c29",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 7,
"id": "49c2e2f0-1e6d-4969-b583-8fc15b8930f9",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "16ae5d5c-a0a7-400b-8e38-231c72ad27b5",
"metadata": {},
"outputs": [],
"source": [
"pattern_dates = re.compile(r'(\\d{1,2}\\.)?(\\d{1,2}\\.)?([\\d]{2,4})?')"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "3b9fe636-f895-404a-819d-61198d34262d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Am war ich essen. Am hingegen nicht. Und war ich allein.'"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'\n",
"string = pattern_dates.sub('', string)\n",
"string"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c49ab3c-e860-42af-ac0c-2f44f075e846",
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,

pdm.lock generated (2079 lines changed)

File diff suppressed because it is too large

View File

@ -15,6 +15,7 @@ dependencies = [
"typing-extensions>=4.12.2", "typing-extensions>=4.12.2",
"tqdm>=4.67.0", "tqdm>=4.67.0",
"python-dateutil>=2.9.0.post0", "python-dateutil>=2.9.0.post0",
"onnx==1.16.1",
] ]
requires-python = ">=3.11" requires-python = ">=3.11"
readme = "README.md" readme = "README.md"
@ -33,6 +34,18 @@ plot = [
cytoscape = [ cytoscape = [
"py4cytoscape>=1.11.0", "py4cytoscape>=1.11.0",
] ]
spacy-trf = [
"de-dep-news-trf @ https://github.com/explosion/spacy-models/releases/download/de_dep_news_trf-3.8.0/de_dep_news_trf-3.8.0-py3-none-any.whl",
]
spacy-sm = [
"de-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl",
]
spacy-md = [
"de-core-news-md @ https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.8.0/de_core_news_md-3.8.0-py3-none-any.whl",
]
spacy-lg = [
"de-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.8.0/de_core_news_lg-3.8.0-py3-none-any.whl",
]
[build-system] [build-system]
requires = ["pdm-backend"] requires = ["pdm-backend"]
build-backend = "pdm.backend" build-backend = "pdm.backend"
@ -57,6 +70,8 @@ dev = [
"cython>=3.0.10", "cython>=3.0.10",
"openpyxl>=3.1.5", "openpyxl>=3.1.5",
"seaborn>=0.13.2", "seaborn>=0.13.2",
"pytest>=8.3.3",
"pytest-cov>=6.0.0",
] ]
[tool.ruff] [tool.ruff]
@ -74,3 +89,36 @@ select = ["E", "F", "I"]
[tool.ruff.lint.isort] [tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"] extra-standard-library = ["typing_extensions"]
[tool.pytest.ini_options]
addopts = [
"-vvl",
"--import-mode=importlib",
]
testpaths = [
"tests",
]
filterwarnings = [
'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
"mload: marks tests with loading of language models (deselect with '-m \"not mload\"')",
]
log_cli = true
[tool.coverage.run]
relative_files = true
source = [
"lang_main",
"tests/",
]
[tool.coverage.report]
exclude_also = [
"def __repr__",
"def __str__",
"@overload",
]
[tool.coverage.html]
directory = "reports/coverage"

python/README.txt (new file, 1 line changed)
View File

@ -0,0 +1 @@
only used to simulate the directory tree in the final solution

View File

@ -1,51 +0,0 @@
import inspect
import logging
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
import warnings
from lang_main.io import load_toml_config
__all__ = [
'CALLER_PATH',
]
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | %(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
USE_INTERNAL_CONFIG: Final[bool] = True
pkg_dir = Path(__file__).parent
cfg_path_internal = pkg_dir / CONFIG_FILENAME
caller_file = Path(inspect.stack()[-1].filename)
CALLER_PATH: Final[Path] = caller_file.parent.resolve()
# load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
cfg_path_external = CALLER_PATH / CONFIG_FILENAME
if not caller_file.exists():
warnings.warn('Caller file could not be correctly retrieved.')
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(
(
'No config file was found. A new one with default values was created '
'in the execution path. Please fill in the necessary values and '
'restart the programm.'
)
)
# raise NotImplementedError("External config data not implemented yet.")
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()

View File

@ -1,14 +1,19 @@
import logging import logging
import os
from pathlib import Path from pathlib import Path
from typing import Any, Final from typing import Any, Final
from lang_main.config import load_toml_config
_has_py4cyto: bool = True _has_py4cyto: bool = True
try: try:
import py4cytoscape as p4c import py4cytoscape as p4c
except ImportError: except ImportError:
_has_py4cyto = False _has_py4cyto = False
from lang_main.io import load_toml_config # ** external packages config
# ** Huggingface Hub caching
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set'
# ** py4cytoscape config # ** py4cytoscape config
if _has_py4cyto: if _has_py4cyto:
@ -20,6 +25,7 @@ if _has_py4cyto:
p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler()) p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
# ** lang-main config # ** lang-main config
BASE_FOLDERNAME: Final[str] = 'lang-main'
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml' CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml' CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
PREFER_INTERNAL_CONFIG: Final[bool] = False PREFER_INTERNAL_CONFIG: Final[bool] = False
@ -75,27 +81,71 @@ def search_iterative(
pattern to look for, first match will be returned, pattern to look for, first match will be returned,
by default CONFIG_FILENAME by default CONFIG_FILENAME
stop_folder_name : str, optional stop_folder_name : str, optional
name of the last folder in the directory tree to search, by default 'python' name of the last folder in the directory tree to search, by default None
Returns Returns
------- -------
Path | None Path | None
Path if corresponding object was found, None otherwise Path if corresponding object was found, None otherwise
""" """
cfg_path: Path | None = None file_path: Path | None = None
stop_folder_reached: bool = False
for it in range(len(starting_path.parents)): for it in range(len(starting_path.parents)):
search_path = starting_path.parents[it] # do not look in library folder search_path = starting_path.parents[it] # do not look in library folder
res = tuple(search_path.glob(glob_pattern)) res = tuple(search_path.glob(glob_pattern))
if res: if res:
cfg_path = res[0] file_path = res[0]
break
elif stop_folder_reached:
break break
if stop_folder_name is not None and search_path.name == stop_folder_name:
# library is placed inside a whole python installation for deployment
# if this folder is reached, only look up one parent above
stop_folder_reached = True
return file_path
def search_base_path(
starting_path: Path,
stop_folder_name: str | None = None,
) -> Path | None:
"""Iteratively searches the parent directories of the starting path
and looks for folders matching the given name. If a match is encountered,
the parent path will be returned.
Example:
starting_path = path/to/start/folder
stop_folder_name = 'to'
returned path = 'path/'
Parameters
----------
starting_path : Path
non-inclusive starting path
stop_folder_name : str, optional
name of the last folder in the directory tree to search, by default None
Returns
-------
Path | None
Path if corresponding base path was found, None otherwise
"""
stop_folder_path: Path | None = None
base_path: Path | None = None
for it in range(len(starting_path.parents)):
search_path = starting_path.parents[it] # do not look in library folder
if stop_folder_name is not None and search_path.name == stop_folder_name: if stop_folder_name is not None and search_path.name == stop_folder_name:
# library is placed inside a whole python installation for deployment # library is placed inside a whole python installation for deployment
# only look up to this folder # only look up to this folder
stop_folder_path = search_path
break break
return cfg_path if stop_folder_path is not None:
base_path = stop_folder_path.parent
return base_path
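A brief usage sketch of the two search helpers, mirroring the docstring example above; the paths are placeholders and search_base_path only compares folder names, so they need not exist (assumes the package and its config can be imported, as in the notebook earlier):

from pathlib import Path

from lang_main import search_base_path, search_iterative  # same import as in the notebook

# walk up from the starting path until a folder named 'to' is found, then return its parent
print(search_base_path(Path('path/to/start/folder'), stop_folder_name='to'))  # Path('path')

# look for the first file matching the glob pattern in the parent directories of the start;
# the keyword name glob_pattern is inferred from the function body above
print(search_iterative(Path.cwd(), glob_pattern='lang_main_config.toml'))  # Path or None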
def load_cfg() -> dict[str, Any]: def load_cfg() -> dict[str, Any]:
@ -121,6 +171,10 @@ def load_cfg() -> dict[str, Any]:
CONFIG: Final[dict[str, Any]] = load_cfg() CONFIG: Final[dict[str, Any]] = load_cfg()
base_parent_path = search_base_path(pkg_dir, stop_folder_name=BASE_FOLDERNAME)
if base_parent_path is None:
raise FileNotFoundError('Could not resolve base path of library')
BASE_PATH: Final[Path] = base_parent_path
# ** Cytoscape configuration # ** Cytoscape configuration

View File

@ -48,9 +48,9 @@ def save_to_GraphML(
def get_graph_metadata( def get_graph_metadata(
graph: Graph | DiGraph, graph: Graph | DiGraph,
logging: bool = LOGGING_DEFAULT_GRAPHS, logging: bool = LOGGING_DEFAULT_GRAPHS,
) -> dict[str, int]: ) -> dict[str, float]:
# info about graph # info about graph
graph_info: dict[str, int] = {} graph_info: dict[str, float] = {}
# nodes and edges # nodes and edges
num_nodes = len(graph.nodes) num_nodes = len(graph.nodes)
num_edges = len(graph.edges) num_edges = len(graph.edges)
@ -96,15 +96,6 @@ def update_graph(
child: Hashable | None = None, child: Hashable | None = None,
weight_connection: int | None = None, weight_connection: int | None = None,
) -> None: ) -> None:
# !! not necessary to check for existence of nodes
# !! feature already implemented in NetworkX ``add_edge``
"""
# check if nodes already in Graph
if parent not in graph:
graph.add_node(parent)
if child not in graph:
graph.add_node(child)
"""
if weight_connection is None: if weight_connection is None:
weight_connection = 1 weight_connection = 1
# check if edge not in Graph # check if edge not in Graph
@ -115,9 +106,7 @@ def update_graph(
graph.add_edge(parent, child, weight=weight_connection) graph.add_edge(parent, child, weight=weight_connection)
else: else:
# update edge # update edge
weight = graph[parent][child]['weight'] graph[parent][child]['weight'] += weight_connection
weight += weight_connection
graph[parent][child]['weight'] = weight
# build undirected adjacency matrix # build undirected adjacency matrix
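The simplified update above accumulates weights in place when an edge already exists; a minimal sketch of that behaviour, assuming update_graph lives in lang_main.analysis.graphs alongside the functions used in the notebook:

import networkx as nx

from lang_main.analysis.graphs import update_graph  # assumed module path

g = nx.DiGraph()
update_graph(graph=g, parent='pumpe', child='dichtung', weight_connection=2)  # creates the edge
update_graph(graph=g, parent='pumpe', child='dichtung', weight_connection=3)  # increments its weight
print(g['pumpe']['dichtung']['weight'])  # 5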
@ -249,7 +238,8 @@ def filter_graph_by_node_degree(
bound_lower: int | None, bound_lower: int | None,
bound_upper: int | None, bound_upper: int | None,
) -> TokenGraph: ) -> TokenGraph:
"""filters all nodes which are within the provided bounds by their degree """filters all nodes which are within the provided bounds by their degree,
inclusive limits: bound_lower <= node_degree <= bound_upper are retained
Parameters Parameters
---------- ----------
@ -266,13 +256,14 @@ def filter_graph_by_node_degree(
# filter nodes by degree # filter nodes by degree
original_graph_nodes = copy.deepcopy(graph.nodes) original_graph_nodes = copy.deepcopy(graph.nodes)
filtered_graph = graph.copy() filtered_graph = graph.copy()
filtered_graph_degree = copy.deepcopy(filtered_graph.degree)
if not any([bound_lower, bound_upper]): if not any([bound_lower, bound_upper]):
logger.warning('No bounds provided, returning original graph.') logger.warning('No bounds provided, returning original graph.')
return filtered_graph return filtered_graph
for node in original_graph_nodes: for node in original_graph_nodes:
degree = filtered_graph.degree[node] # type: ignore degree = cast(int, filtered_graph_degree[node]) # type: ignore
if bound_lower is not None and degree < bound_lower: if bound_lower is not None and degree < bound_lower:
filtered_graph.remove_node(node) filtered_graph.remove_node(node)
if bound_upper is not None and degree > bound_upper: if bound_upper is not None and degree > bound_upper:
@ -540,9 +531,9 @@ class TokenGraph(DiGraph):
self._name = name self._name = name
# directed and undirected graph data # directed and undirected graph data
self._directed = self self._directed = self
self._metadata_directed: dict[str, int] = {} self._metadata_directed: dict[str, float] = {}
self._undirected: Graph | None = None self._undirected: Graph | None = None
self._metadata_undirected: dict[str, int] = {} self._metadata_undirected: dict[str, float] = {}
# indicate rescaled weights # indicate rescaled weights
self.rescaled_weights: bool = False self.rescaled_weights: bool = False
@ -568,12 +559,12 @@ class TokenGraph(DiGraph):
return hash(self.__key()) return hash(self.__key())
""" """
def copy(self) -> Self: def copy(self) -> TokenGraph:
"""returns a (deep) copy of the graph """returns a (deep) copy of the graph
Returns Returns
------- -------
Self TokenGraph
deep copy of the graph deep copy of the graph
""" """
return copy.deepcopy(self) return copy.deepcopy(self)
@ -594,11 +585,11 @@ class TokenGraph(DiGraph):
return self._undirected return self._undirected
@property @property
def metadata_directed(self) -> dict[str, int]: def metadata_directed(self) -> dict[str, float]:
return self._metadata_directed return self._metadata_directed
@property @property
def metadata_undirected(self) -> dict[str, int]: def metadata_undirected(self) -> dict[str, float]:
return self._metadata_undirected return self._metadata_undirected
@overload @overload

View File

@ -30,7 +30,7 @@ if TYPE_CHECKING:
# ** (1) dataset preparation: loading and simple preprocessing # ** (1) dataset preparation: loading and simple preprocessing
# following functions used to load a given dataset and perform simple # following functions are used to load a given dataset and perform simple
# duplicate cleansing based on all properties # duplicate cleansing based on all properties
def load_raw_data( def load_raw_data(
path: Path, path: Path,
@ -277,41 +277,41 @@ def merge_similarity_dupl(
# ** ################################################################################# # ** #################################################################################
# TODO check removal # TODO check removal
def build_embedding_map( # def build_embedding_map(
data: Series, # data: Series,
model: GermanSpacyModel | SentenceTransformer, # model: GermanSpacyModel | SentenceTransformer,
) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]: # ) -> tuple[dict[int, tuple[Embedding, str]], tuple[bool, bool]]:
# dictionary with embeddings # # dictionary with embeddings
embeddings: dict[int, tuple[Embedding, str]] = {} # embeddings: dict[int, tuple[Embedding, str]] = {}
is_spacy = False # is_spacy = False
is_STRF = False # is_STRF = False
if isinstance(model, GermanSpacyModel): # if isinstance(model, GermanSpacyModel):
is_spacy = True # is_spacy = True
elif isinstance(model, SentenceTransformer): # elif isinstance(model, SentenceTransformer):
is_STRF = True # is_STRF = True
if not any((is_spacy, is_STRF)): # if not any((is_spacy, is_STRF)):
raise NotImplementedError('Model type unknown') # raise NotImplementedError('Model type unknown')
for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0): # for idx, text in tqdm(data.items(), total=len(data), mininterval=1.0):
# verbose code: Pyright not inferring types correctly # # verbose code: Pyright not inferring types correctly
idx = cast(int, idx) # idx = cast(int, idx)
text = cast(str, text) # text = cast(str, text)
if is_spacy: # if is_spacy:
model = cast(GermanSpacyModel, model) # model = cast(GermanSpacyModel, model)
embd = cast(SpacyDoc, model(text)) # embd = cast(SpacyDoc, model(text))
embeddings[idx] = (embd, text) # embeddings[idx] = (embd, text)
# check for empty vectors # # check for empty vectors
if not embd.vector_norm: # if not embd.vector_norm:
logger.debug('--- Unknown Words ---') # logger.debug('--- Unknown Words ---')
logger.debug('embd.text: %s has no vector', embd.text) # logger.debug('embd.text: %s has no vector', embd.text)
elif is_STRF: # elif is_STRF:
model = cast(SentenceTransformer, model) # model = cast(SentenceTransformer, model)
embd = cast(Tensor, model.encode(text, show_progress_bar=False)) # embd = cast(Tensor, model.encode(text, show_progress_bar=False))
embeddings[idx] = (embd, text) # embeddings[idx] = (embd, text)
return embeddings, (is_spacy, is_STRF) # return embeddings, (is_spacy, is_STRF)
# adapt interface # adapt interface
@ -320,276 +320,275 @@ def build_embedding_map(
# build similarity matrix out of embeddings # build similarity matrix out of embeddings
def build_cosSim_matrix( # def build_cosSim_matrix(
data: Series, # data: Series,
model: GermanSpacyModel | SentenceTransformer, # model: GermanSpacyModel | SentenceTransformer,
) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]: # ) -> tuple[DataFrame, dict[int, tuple[Embedding, str]]]:
# build empty matrix # # build empty matrix
df_index = data.index # df_index = data.index
cosineSim_idx_matrix = pd.DataFrame( # cosineSim_idx_matrix = pd.DataFrame(
data=0.0, columns=df_index, index=df_index, dtype=np.float32 # data=0.0, columns=df_index, index=df_index, dtype=np.float32
) # )
logger.info('Start building embedding map...') # logger.info('Start building embedding map...')
# obtain embeddings based on used model # # obtain embeddings based on used model
embds, (is_spacy, is_STRF) = build_embedding_map( # embds, (is_spacy, is_STRF) = build_embedding_map(
data=data, # data=data,
model=model, # model=model,
) # )
logger.info('Embedding map built successfully.') # logger.info('Embedding map built successfully.')
# apply index based mapping for efficient handling of large texts # # apply index based mapping for efficient handling of large texts
combs = combinations(df_index, 2) # combs = combinations(df_index, 2)
total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2) # total_combs = factorial(len(df_index)) // factorial(2) // factorial(len(df_index) - 2)
logger.info('Start calculation of similarity scores...') # logger.info('Start calculation of similarity scores...')
for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0): # for idx1, idx2 in tqdm(combs, total=total_combs, mininterval=1.0):
# print(f"{idx1=}, {idx2=}") # # print(f"{idx1=}, {idx2=}")
embd1 = embds[idx1][0] # embd1 = embds[idx1][0]
embd2 = embds[idx2][0] # embd2 = embds[idx2][0]
# calculate similarity based on model type # # calculate similarity based on model type
if is_spacy: # if is_spacy:
embd1 = cast(SpacyDoc, embds[idx1][0]) # embd1 = cast(SpacyDoc, embds[idx1][0])
embd2 = cast(SpacyDoc, embds[idx2][0]) # embd2 = cast(SpacyDoc, embds[idx2][0])
cosSim = embd1.similarity(embd2) # cosSim = embd1.similarity(embd2)
elif is_STRF: # elif is_STRF:
embd1 = cast(Tensor, embds[idx1][0]) # embd1 = cast(Tensor, embds[idx1][0])
embd2 = cast(Tensor, embds[idx2][0]) # embd2 = cast(Tensor, embds[idx2][0])
cosSim = sentence_transformers.util.cos_sim(embd1, embd2) # cosSim = sentence_transformers.util.cos_sim(embd1, embd2)
cosSim = cast(float, cosSim.item()) # cosSim = cast(float, cosSim.item())
cosineSim_idx_matrix.at[idx1, idx2] = cosSim # cosineSim_idx_matrix.at[idx1, idx2] = cosSim
logger.info('Similarity scores calculated successfully.') # logger.info('Similarity scores calculated successfully.')
return cosineSim_idx_matrix, embds # return cosineSim_idx_matrix, embds
# obtain index pairs with cosine similarity # obtain index pairs with cosine similarity
# greater than or equal to given threshold value # greater than or equal to given threshold value
def filt_thresh_cosSim_matrix( # def filt_thresh_cosSim_matrix(
cosineSim_idx_matrix: DataFrame, # cosineSim_idx_matrix: DataFrame,
embds: dict[int, tuple[Embedding, str]], # embds: dict[int, tuple[Embedding, str]],
threshold: float, # threshold: float,
) -> tuple[Series, dict[int, tuple[Embedding, str]]]: # ) -> tuple[Series, dict[int, tuple[Embedding, str]]]:
"""filter similarity matrix by threshold value and return index pairs with # """filter similarity matrix by threshold value and return index pairs with
a similarity score greater than the provided threshold # a similarity score greater than the provided threshold
Parameters # Parameters
---------- # ----------
threshold : float # threshold : float
similarity threshold # similarity threshold
cosineSim_idx_matrix : DataFrame # cosineSim_idx_matrix : DataFrame
similarity matrix # similarity matrix
Returns # Returns
------- # -------
Series # Series
series with multi index (index pairs) and corresponding similarity score # series with multi index (index pairs) and corresponding similarity score
""" # """
cosineSim_filt = cast( # cosineSim_filt = cast(
Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack() # Series, cosineSim_idx_matrix.where(cosineSim_idx_matrix >= threshold).stack()
) # )
return cosineSim_filt, embds # return cosineSim_filt, embds
def list_cosSim_dupl_candidates( # def list_cosSim_dupl_candidates(
cosineSim_filt: Series, # cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]], # embds: dict[int, tuple[Embedding, str]],
save_candidates: bool = False, # save_candidates: bool = False,
saving_path: Path | None = None, # saving_path: Path | None = None,
filename: str = 'CosSim-FilterCandidates', # filename: str = 'CosSim-FilterCandidates',
pipeline: Pipeline | None = None, # pipeline: Pipeline | None = None,
) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]: # ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
"""providing an overview of candidates with a similarity score greater than # """providing an overview of candidates with a similarity score greater than
given threshold; more suitable for debugging purposes # given threshold; more suitable for debugging purposes
Returns # Returns
------- # -------
DataFrame # DataFrame
contains indices, corresponding texts and similarity score to evaluate results # contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]] # list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than # list containing relevant index pairs for entries with similarity score greater than
given threshold # given threshold
""" # """
logger.info('Start gathering of similarity candidates...') # logger.info('Start gathering of similarity candidates...')
# compare found duplicates # # compare found duplicates
columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score'] # columns: list[str] = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns) # df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] # index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore # for (idx1, idx2), score in tqdm(cosineSim_filt.items(), total=len(cosineSim_filt)): # type: ignore
# get text content from embedding as second tuple entry # # get text content from embedding as second tuple entry
content = [ # content = [
[ # [
idx1, # idx1,
embds[idx1][1], # embds[idx1][1],
idx2, # idx2,
embds[idx2][1], # embds[idx2][1],
score, # score,
] # ]
] # ]
# add candidates to collection DataFrame # # add candidates to collection DataFrame
df_conc = pd.DataFrame(columns=columns, data=content) # df_conc = pd.DataFrame(columns=columns, data=content)
if df_candidates.empty: # if df_candidates.empty:
df_candidates = df_conc.copy() # df_candidates = df_conc.copy()
else: # else:
df_candidates = pd.concat([df_candidates, df_conc]) # df_candidates = pd.concat([df_candidates, df_conc])
# save index pairs # # save index pairs
index_pairs.append((idx1, idx2)) # index_pairs.append((idx1, idx2))
logger.info('Similarity candidates gathered successfully.') # logger.info('Similarity candidates gathered successfully.')
if save_candidates: # if save_candidates:
if saving_path is None: # if saving_path is None:
raise ValueError( # raise ValueError(
('Saving path must be provided if duplicate ' 'candidates should be saved.') # ('Saving path must be provided if duplicate ' 'candidates should be saved.')
) # )
elif pipeline is not None: # elif pipeline is not None:
target_filename = ( # target_filename = (
f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx' # f'Pipe-{pipeline.name}_Step_{pipeline.curr_proc_idx}_' + filename + '.xlsx'
) # )
elif pipeline is None: # elif pipeline is None:
target_filename = f'{filename}.xlsx' # target_filename = f'{filename}.xlsx'
logger.info('Saving similarity candidates...') # logger.info('Saving similarity candidates...')
target_path = saving_path.joinpath(target_filename) # target_path = saving_path.joinpath(target_filename)
df_candidates.to_excel(target_path) # df_candidates.to_excel(target_path)
logger.info('Similarity candidates saved successfully to >>%s<<.', target_path) # logger.info('Similarity candidates saved successfully to >>%s<<.', target_path)
return index_pairs, embds # return index_pairs, embds
# TODO: change implementation fully to SentenceTransformer # TODO: change implementation fully to SentenceTransformer
# usage of batch processing for embeddings, use candidate idx function # usage of batch processing for embeddings, use candidate idx function
# from time analysis --> moved to ``helpers.py`` # from time analysis --> moved to ``helpers.py``
"""
def similar_ids_connection_graph(
similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]:
# build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset
similar_id_graph = nx.Graph()
for (idx1, idx2) in similar_idx_pairs:
# inplace operation, parent/child do not really exist in undirected graph
update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
graph_info = get_graph_metadata(graph=similar_id_graph, logging=True) # def similar_ids_connection_graph(
# similar_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
# ) -> tuple[Graph, dict[str, int]]:
# # build index graph to obtain graph of connected (similar) indices
# # use this graph to get connected components (indices which belong together)
# # retain semantic connection on whole dataset
# similar_id_graph = nx.Graph()
# for (idx1, idx2) in similar_idx_pairs:
# # inplace operation, parent/child do not really exist in undirected graph
# update_graph(graph=similar_id_graph, parent=idx1, child=idx2)
return similar_id_graph, graph_info # graph_info = get_graph_metadata(graph=similar_id_graph, logging=True)
def similar_ids_groups( # return similar_id_graph, graph_info
dupl_id_graph: Graph,
) -> Iterator[list[PandasIndex]]:
# groups of connected indices
ids_groups = cast(Iterator[set[PandasIndex]],
nx.connected_components(G=dupl_id_graph))
for id_group in ids_groups: # def similar_ids_groups(
yield list(id_group) # dupl_id_graph: Graph,
""" # ) -> Iterator[list[PandasIndex]]:
# # groups of connected indices
# ids_groups = cast(Iterator[set[PandasIndex]],
# nx.connected_components(G=dupl_id_graph))
# for id_group in ids_groups:
# yield list(id_group)
# merge duplicates # # merge duplicates
def merge_similarity_dupl_old( # def merge_similarity_dupl_old(
data: DataFrame, # data: DataFrame,
dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]], # dupl_idx_pairs: list[tuple[PandasIndex, PandasIndex]],
) -> tuple[DataFrame]: # ) -> tuple[DataFrame]:
# copy pre-cleaned data # # copy pre-cleaned data
temp = data.copy() # temp = data.copy()
index = temp.index # index = temp.index
# logger.info("Start merging of similarity candidates...") # # logger.info("Start merging of similarity candidates...")
# iterate over index pairs # # iterate over index pairs
for i1, i2 in tqdm(dupl_idx_pairs): # for i1, i2 in tqdm(dupl_idx_pairs):
# if an entry does not exist any more, skip this pair # # if an entry does not exist any more, skip this pair
if i1 not in index or i2 not in index: # if i1 not in index or i2 not in index:
continue # continue
# merge num occur # # merge num occur
num_occur1 = temp.at[i1, 'num_occur'] # num_occur1 = temp.at[i1, 'num_occur']
num_occur2 = temp.at[i2, 'num_occur'] # num_occur2 = temp.at[i2, 'num_occur']
new_num_occur = num_occur1 + num_occur2 # new_num_occur = num_occur1 + num_occur2
# merge associated object ids # # merge associated object ids
assoc_ids1 = temp.at[i1, 'assoc_obj_ids'] # assoc_ids1 = temp.at[i1, 'assoc_obj_ids']
assoc_ids2 = temp.at[i2, 'assoc_obj_ids'] # assoc_ids2 = temp.at[i2, 'assoc_obj_ids']
new_assoc_ids = np.append(assoc_ids1, assoc_ids2) # new_assoc_ids = np.append(assoc_ids1, assoc_ids2)
new_assoc_ids = np.unique(new_assoc_ids.flatten()) # new_assoc_ids = np.unique(new_assoc_ids.flatten())
# recalculate num associated obj ids # # recalculate num associated obj ids
new_num_assoc_obj_ids = len(new_assoc_ids) # new_num_assoc_obj_ids = len(new_assoc_ids)
# write properties to first entry # # write properties to first entry
temp.at[i1, 'num_occur'] = new_num_occur # temp.at[i1, 'num_occur'] = new_num_occur
temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids # temp.at[i1, 'assoc_obj_ids'] = new_assoc_ids
temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids # temp.at[i1, 'num_assoc_obj_ids'] = new_num_assoc_obj_ids
# drop second entry # # drop second entry
temp = temp.drop(index=i2) # temp = temp.drop(index=i2)
index = temp.index # index = temp.index
# logger.info("Similarity candidates merged successfully.") # # logger.info("Similarity candidates merged successfully.")
return (temp,) # return (temp,)
# ** debugging and evaluation # ** debugging and evaluation
def choose_cosSim_dupl_candidates( # def choose_cosSim_dupl_candidates(
cosineSim_filt: Series, # cosineSim_filt: Series,
embds: dict[int, tuple[Embedding, str]], # embds: dict[int, tuple[Embedding, str]],
) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]: # ) -> tuple[DataFrame, list[tuple[PandasIndex, PandasIndex]]]:
"""providing an overview of candidates with a similarity score greater than # """providing an overview of candidates with a similarity score greater than
given threshold, but decision is made manually by iterating through the candidates # given threshold, but decision is made manually by iterating through the candidates
with user interaction; more suitable for debugging purposes # with user interaction; more suitable for debugging purposes
Returns # Returns
------- # -------
DataFrame # DataFrame
contains indices, corresponding texts and similarity score to evaluate results # contains indices, corresponding texts and similarity score to evaluate results
list[tuple[Index, Index]] # list[tuple[Index, Index]]
list containing relevant index pairs for entries with similarity score greater than # list containing relevant index pairs for entries with similarity score greater than
given threshold # given threshold
""" # """
# compare found duplicates # # compare found duplicates
columns = ['idx1', 'text1', 'idx2', 'text2', 'score'] # columns = ['idx1', 'text1', 'idx2', 'text2', 'score']
df_candidates = pd.DataFrame(columns=columns) # df_candidates = pd.DataFrame(columns=columns)
index_pairs: list[tuple[PandasIndex, PandasIndex]] = [] # index_pairs: list[tuple[PandasIndex, PandasIndex]] = []
for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore # for (idx1, idx2), score in cosineSim_filt.items(): # type: ignore
# get texts for comparison # # get texts for comparison
text1 = embds[idx1][1] # text1 = embds[idx1][1]
text2 = embds[idx2][1] # text2 = embds[idx2][1]
# get decision # # get decision
print('---------- New Decision ----------') # print('---------- New Decision ----------')
print('text1:\n', text1, '\n', flush=True) # print('text1:\n', text1, '\n', flush=True)
print('text2:\n', text2, '\n', flush=True) # print('text2:\n', text2, '\n', flush=True)
decision = input('Please enter >>y<< if this is a duplicate, else hit enter:') # decision = input('Please enter >>y<< if this is a duplicate, else hit enter:')
if not decision == 'y': # if not decision == 'y':
continue # continue
# get text content from embedding as second tuple entry # # get text content from embedding as second tuple entry
content = [ # content = [
[ # [
idx1, # idx1,
text1, # text1,
idx2, # idx2,
text2, # text2,
score, # score,
] # ]
] # ]
df_conc = pd.DataFrame(columns=columns, data=content) # df_conc = pd.DataFrame(columns=columns, data=content)
df_candidates = pd.concat([df_candidates, df_conc]) # df_candidates = pd.concat([df_candidates, df_conc])
index_pairs.append((idx1, idx2)) # index_pairs.append((idx1, idx2))
return df_candidates, index_pairs # return df_candidates, index_pairs

View File

@ -22,7 +22,7 @@ pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+') pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+') pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])') pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?') pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)?([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}') pattern_whitespace = re.compile(r'[ ]{2,}')
@ -43,7 +43,7 @@ def clean_string_slim(string: str) -> str:
cleaned entry cleaned entry
""" """
# remove special chars # remove special chars
string = pattern_escape_newline.sub('. ', string) # string = pattern_escape_newline.sub(' ', string)
string = pattern_escape_seq.sub(' ', string) string = pattern_escape_seq.sub(' ', string)
string = pattern_repeated_chars.sub('', string) string = pattern_repeated_chars.sub('', string)
# string = pattern_dates.sub('', string) # string = pattern_dates.sub('', string)
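The loosened pattern_dates (second group now optional as well) can be checked in isolation; a small sketch reproducing the notebook experiment from earlier in this commit:

import re

pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)?([\d]{2,4})?')  # regex from the hunk above
pattern_whitespace = re.compile(r'[ ]{2,}')  # defined further up in the same module

text = 'Am 11.02.2024 war ich essen. Am 11.12. hingegen nicht. Und 2024 war ich allein.'
stripped = pattern_dates.sub('', text)  # removes full dates, day.month fragments and bare years
print(pattern_whitespace.sub(' ', stripped))
# 'Am war ich essen. Am hingegen nicht. Und war ich allein.'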
@ -127,7 +127,7 @@ def candidates_by_index(
def similar_index_connection_graph( def similar_index_connection_graph(
similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]], similar_idx_pairs: Iterable[tuple[PandasIndex, PandasIndex]],
) -> tuple[Graph, dict[str, int]]: ) -> tuple[Graph, dict[str, float]]:
# build index graph to obtain graph of connected (similar) indices # build index graph to obtain graph of connected (similar) indices
# use this graph to get connected components (indices which belong together) # use this graph to get connected components (indices which belong together)
# retain semantic connection on whole dataset # retain semantic connection on whole dataset

src/lang_main/config.py (new file, 17 lines changed)
View File

@ -0,0 +1,17 @@
from __future__ import annotations
import sys
import tomllib
from typing import TYPE_CHECKING, Any
if TYPE_CHECKING:
from pathlib import Path
def load_toml_config(
path_to_toml: str | Path,
) -> dict[str, Any]:
with open(path_to_toml, 'rb') as f:
data = tomllib.load(f)
print('Loaded TOML config file successfully.', file=sys.stderr, flush=True)
return data
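A short usage sketch of the relocated loader; the path is a placeholder:

from pathlib import Path

from lang_main.config import load_toml_config

cfg = load_toml_config(Path('lang_main_config.toml'))  # prints its success message to stderr
print(cfg['paths']['inputs'])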

View File

@ -2,22 +2,21 @@ from enum import Enum # noqa: I001
from importlib.util import find_spec from importlib.util import find_spec
from pathlib import Path from pathlib import Path
from typing import Final from typing import Final
import os
from sentence_transformers import SimilarityFunction from sentence_transformers import SimilarityFunction
from lang_main import CONFIG, CYTO_PATH_STYLESHEET from lang_main import CONFIG, CYTO_PATH_STYLESHEET, BASE_PATH
from lang_main import model_loader as m_load
from lang_main.types import ( from lang_main.types import (
CytoLayoutProperties, CytoLayoutProperties,
CytoLayouts, CytoLayouts,
LanguageModels,
ModelLoaderMap,
ONNXExecutionProvider, # noqa: F401 ONNXExecutionProvider, # noqa: F401
STFRBackends, STFRBackends,
STFRDeviceTypes, STFRDeviceTypes,
STFRModelArgs, STFRModelArgs,
STFRModels, STFRModelTypes,
STFRQuantFilenames, # noqa: F401 STFRQuantFilenames, # noqa: F401
SpacyModelTypes,
) )
__all__ = [ __all__ = [
@@ -67,35 +66,29 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
 # ** models
 # ** loading
-SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
-STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
+MODEL_BASE_FOLDER_NAME: Final[str] = 'lang-models'
+MODEL_BASE_FOLDER: Final[Path] = BASE_PATH / MODEL_BASE_FOLDER_NAME
+if not MODEL_BASE_FOLDER.exists():
+    raise FileNotFoundError('Language model folder not found.')
+os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER)
+
+SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_DEP_NEWS_TRF
+STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2
 STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
 STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
 STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
-STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
-# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
-#     'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
-#     'provider': ONNXExecutionProvider.CPU,
-#     'export': False,
-# }
-MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
-    LanguageModels.SENTENCE_TRANSFORMER: {
-        'func': m_load.load_sentence_transformer,
-        'kwargs': {
-            'model_name': STFR_MODEL_NAME,
-            'similarity_func': STFR_SIMILARITY,
-            'backend': STFR_BACKEND,
-            'device': STFR_DEVICE,
-            'model_kwargs': STFR_MODEL_ARGS,
-        },
-    },
-    LanguageModels.SPACY: {
-        'func': m_load.load_spacy,
-        'kwargs': {
-            'model_name': SPACY_MODEL_NAME,
-        },
-    },
-}
+STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {}
+STFR_MODEL_ARGS_ONNX: STFRModelArgs = {
+    'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
+    'provider': ONNXExecutionProvider.CPU,
+    'export': False,
+}
+
+stfr_model_args: STFRModelArgs
+if STFR_BACKEND == STFRBackends.ONNX:
+    stfr_model_args = STFR_MODEL_ARGS_ONNX
+else:
+    stfr_model_args = STFR_MODEL_ARGS_DEFAULT
+STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args

 # ** language dependency analysis
 # ** POS
 # POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
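The net effect of the backend switch above: STFR_MODEL_ARGS carries the quantised-ONNX settings only when STFR_BACKEND is set to ONNX; a small illustrative check (not part of the commit, assuming the constants are importable as defined above):

# illustrative check of the resolved STFR_MODEL_ARGS
from lang_main.constants import STFR_BACKEND, STFR_MODEL_ARGS
from lang_main.types import STFRBackends

if STFR_BACKEND == STFRBackends.ONNX:
    # quantised weights are read from 'onnx/model_quint8_avx2.onnx'
    assert STFR_MODEL_ARGS['file_name'] == 'onnx/model_quint8_avx2.onnx'
else:
    # the default torch backend needs no extra model arguments
    assert STFR_MODEL_ARGS == {}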

(changed file: lang_main.errors)
@@ -1,3 +1,9 @@
+# ** meta exceptions
+class LanguageModelNotFoundError(Exception):
+    """Error raised if a given language model could not be loaded successfully"""
+
+
+# ** token graph exceptions
 class EdgePropertyNotContainedError(Exception):
     """Error raised if a needed edge property is not contained in graph edges"""
@@ -21,8 +27,6 @@ class DependencyMissingError(Exception):
 # ** pipelines to perform given actions on dataset in a customisable manner
 class NoPerformableActionError(Exception):
     """Error describing that no action is available in the current pipeline"""

(changed file: lang_main.io)
@@ -1,7 +1,6 @@
 import base64
 import pickle
 import shutil
-import tomllib
 from pathlib import Path
 from typing import Any
@@ -33,15 +32,6 @@
 )

-def load_toml_config(
-    path_to_toml: str | Path,
-) -> dict[str, Any]:
-    with open(path_to_toml, 'rb') as f:
-        data = tomllib.load(f)
-    logger.info('Loaded TOML config file successfully.')
-    return data

 # saving and loading using pickle
 # careful: pickling from unknown sources can be dangerous
 def save_pickle(

(changed file: the packaged lang_main_config.toml)
@@ -1,4 +1,6 @@
 # lang_main: Config file
+[info]
+pkg = 'lang_main'

 [paths]
 inputs = './inputs/'

(changed file: lang_main logging setup)
@@ -5,6 +5,7 @@ from time import gmtime
 from typing import Final

 from lang_main.constants import (
+    BASE_PATH,
     ENABLE_LOGGING,
     LOGGING_TO_FILE,
     LOGGING_TO_STDERR,
@@ -15,11 +16,11 @@ from lang_main.types import LoggingLevels
 logging.Formatter.converter = gmtime
 LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
 LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
-LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
-# logging.basicConfig(
-#     format=LOG_FMT,
-#     datefmt=LOG_DATE_FMT,
-# )
+LOG_FILE_FOLDER: Final[Path] = BASE_PATH / 'logs'
+if not LOG_FILE_FOLDER.exists():
+    LOG_FILE_FOLDER.mkdir(parents=True)
+LOG_FILE_PATH: Final[Path] = LOG_FILE_FOLDER / 'lang-main.log'

 # ** formatters
 logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
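Since logging.Formatter.converter is set to gmtime, every timestamp is rendered in UTC; a small, self-contained illustration of the record layout defined above (the handler wiring here exists only for the demo):

# demo of the LOG_FMT / LOG_DATE_FMT record layout (UTC timestamps)
import logging
from time import gmtime

logging.Formatter.converter = gmtime
formatter = logging.Formatter(
    fmt='%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S +0000',
)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
demo_logger = logging.getLogger('lang_main_demo')
demo_logger.addHandler(handler)
demo_logger.warning('example message')
# output shape: '<UTC timestamp> +0000 | lang_main:<calling module>:WARNING | example message'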

(changed file: lang_main.model_loader)
@@ -1,16 +1,25 @@
 from __future__ import annotations

+import importlib
 from typing import (
-    TYPE_CHECKING,
     Any,
+    Final,
     Literal,
+    cast,
     overload,
 )

-import spacy
-from sentence_transformers import SentenceTransformer
+from sentence_transformers import SentenceTransformer, SimilarityFunction

-from lang_main.constants import STFR_SIMILARITY
+from lang_main.constants import (
+    SPACY_MODEL_NAME,
+    STFR_BACKEND,
+    STFR_DEVICE,
+    STFR_MODEL_ARGS,
+    STFR_MODEL_NAME,
+    STFR_SIMILARITY,
+)
+from lang_main.errors import LanguageModelNotFoundError
 from lang_main.types import (
     LanguageModels,
     Model,
@@ -20,9 +29,6 @@ from lang_main.types import (
     STFRDeviceTypes,
 )

-if TYPE_CHECKING:
-    from sentence_transformers import SimilarityFunction

 @overload
 def instantiate_model(
@@ -53,14 +59,27 @@ def instantiate_model(
 def load_spacy(
     model_name: str,
 ) -> SpacyModel:
-    return spacy.load(model_name)
+    try:
+        spacy_model_obj = importlib.import_module(model_name)
+    except ModuleNotFoundError:
+        raise LanguageModelNotFoundError(
+            (
+                f'Could not find spaCy model >>{model_name}<<. '
+                f'Check if it is installed correctly.'
+            )
+        )
+    pretrained_model = cast(SpacyModel, spacy_model_obj.load())
+    return pretrained_model
 def load_sentence_transformer(
     model_name: str,
-    similarity_func: SimilarityFunction = STFR_SIMILARITY,
+    similarity_func: SimilarityFunction = SimilarityFunction.COSINE,
     backend: STFRBackends = STFRBackends.TORCH,
     device: STFRDeviceTypes = STFRDeviceTypes.CPU,
+    local_files_only: bool = False,
+    model_save_folder: str | None = None,
     model_kwargs: dict[str, Any] | None = None,
 ) -> SentenceTransformer:
     return SentenceTransformer(
@@ -68,5 +87,28 @@ def load_sentence_transformer(
         similarity_fn_name=similarity_func,
         backend=backend,  # type: ignore Literal matches Enum
         device=device,
+        cache_folder=model_save_folder,
+        local_files_only=local_files_only,
         model_kwargs=model_kwargs,
     )


+# ** configured model builder functions
+MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
+    LanguageModels.SENTENCE_TRANSFORMER: {
+        'func': load_sentence_transformer,
+        'kwargs': {
+            'model_name': STFR_MODEL_NAME,
+            'similarity_func': STFR_SIMILARITY,
+            'backend': STFR_BACKEND,
+            'device': STFR_DEVICE,
+            'model_kwargs': STFR_MODEL_ARGS,
+        },
+    },
+    LanguageModels.SPACY: {
+        'func': load_spacy,
+        'kwargs': {
+            'model_name': SPACY_MODEL_NAME,
+        },
+    },
+}
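The map pairs each LanguageModels member with a loader function and its keyword arguments; a sketch of how such a map is typically consumed (the body of instantiate_model is not shown in this diff, so the dispatch below is an assumption):

# assumed dispatch logic; the real instantiate_model may differ
from lang_main.model_loader import MODEL_LOADER_MAP
from lang_main.types import LanguageModels

def instantiate_from_map(model_load_map, model: LanguageModels):
    entry = model_load_map[model]
    return entry['func'](**entry['kwargs'])

nlp = instantiate_from_map(MODEL_LOADER_MAP, LanguageModels.SPACY)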

(changed file: lang_main pipeline construction)
@@ -30,7 +30,6 @@ from lang_main.constants import (
     DATE_COLS,
     FEATURE_NAME_OBJ_ID,
     MODEL_INPUT_FEATURES,
-    MODEL_LOADER_MAP,
     NAME_DELTA_FEAT_TO_REPAIR,
     SAVE_PATH_FOLDER,
     THRESHOLD_AMOUNT_CHARACTERS,
@@ -41,6 +40,7 @@ from lang_main.constants import (
     THRESHOLD_UNIQUE_TEXTS,
     UNIQUE_CRITERION_FEATURE,
 )
+from lang_main.model_loader import MODEL_LOADER_MAP
 from lang_main.pipelines.base import Pipeline
 from lang_main.types import EntryPoints, LanguageModels

(changed file: lang_main.types)
@@ -45,13 +45,20 @@ class ONNXExecutionProvider(enum.StrEnum):
     CPU = 'CPUExecutionProvider'

-class STFRModels(enum.StrEnum):
+class STFRModelTypes(enum.StrEnum):
     ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
     ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
     ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
     ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'


+class SpacyModelTypes(enum.StrEnum):
+    DE_CORE_NEWS_SM = 'de_core_news_sm'
+    DE_CORE_NEWS_MD = 'de_core_news_md'
+    DE_CORE_NEWS_LG = 'de_core_news_lg'
+    DE_DEP_NEWS_TRF = 'de_dep_news_trf'


 class STFRQuantFilenames(enum.StrEnum):
     ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'
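Because these are StrEnum members, each one behaves like its plain string value, which is what lets load_spacy hand a SpacyModelTypes member directly to importlib; a short illustration:

# StrEnum members compare equal to (and render as) their string values
from lang_main.types import STFRModelTypes, SpacyModelTypes

assert SpacyModelTypes.DE_DEP_NEWS_TRF == 'de_dep_news_trf'
assert str(STFRModelTypes.ALL_MPNET_BASE_V2) == 'all-mpnet-base-v2'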

(one file diff suppressed: too large to display)

tests/analyse_dataset.xlsx (new binary file, not shown)

(new test file: unit tests for lang_main.analysis.graphs)
@@ -0,0 +1,168 @@
import networkx as nx
import pytest

from lang_main.analysis import graphs

TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH'


def build_init_graph(token_graph: bool):
    edge_weights = [
        {'weight': 1},
        {'weight': 2},
        {'weight': 3},
        {'weight': 4},
        {'weight': 5},
        {'weight': 6},
    ]
    edges = [
        (1, 2),
        (1, 3),
        (2, 4),
        (3, 4),
        (1, 4),
        (2, 1),
    ]
    edges_to_add = []
    for i, edge in enumerate(edges):
        edge = list(edge)
        edge.append(edge_weights[i])  # type: ignore
        edges_to_add.append(tuple(edge))
    if token_graph:
        G = graphs.TokenGraph(name=TK_GRAPH_NAME, enable_logging=False)
    else:
        G = nx.DiGraph()
    G.add_edges_from(edges_to_add)
    return G


@pytest.fixture(scope='module')
def graph():
    return build_init_graph(token_graph=False)


@pytest.fixture(scope='module')
def tk_graph():
    return build_init_graph(token_graph=True)


def test_graph_size(graph):
    assert len(graph.nodes) == 4
    assert len(graph.edges) == 6


def test_save_to_GraphML(graph, tmp_path):
    filename = 'test_graphML'
    graphs.save_to_GraphML(graph, saving_path=tmp_path, filename=filename)
    saved_file = (tmp_path / filename).with_suffix('.graphml')
    assert saved_file.exists()


def test_metadata_retrieval(graph):
    metadata = graphs.get_graph_metadata(graph)
    assert metadata['num_nodes'] == 4
    assert metadata['num_edges'] == 6
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 6
    assert metadata['node_memory'] == 112
    assert metadata['edge_memory'] == 336
    assert metadata['total_memory'] == 448


def test_graph_update_batch():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, batch=((4, 5), (5, 6)), weight_connection=8)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 6
    assert metadata['num_edges'] == 8
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 8


def test_graph_update_single_new():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, parent=4, child=5, weight_connection=7)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 5
    assert metadata['num_edges'] == 7
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 7


def test_graph_update_single_existing():
    graph_obj = build_init_graph(token_graph=False)
    graphs.update_graph(graph_obj, parent=1, child=4, weight_connection=5)
    metadata = graphs.get_graph_metadata(graph_obj)
    assert metadata['num_nodes'] == 4
    assert metadata['num_edges'] == 6
    assert metadata['min_edge_weight'] == 1
    assert metadata['max_edge_weight'] == 10


@pytest.mark.parametrize('cast_int', [True, False])
def test_graph_undirected_conversion(graph, cast_int):
    graph_undir = graphs.convert_graph_to_undirected(graph, cast_int=cast_int)
    # edges: (1, 2, w=1) and (2, 1, w=6) --> undirected: (1, 2, w=7)
    assert graph_undir[1][2]['weight'] == pytest.approx(7.0)


def test_graph_cytoscape_conversion(graph):
    cyto_graph, weight_data = graphs.convert_graph_to_cytoscape(graph)
    node = cyto_graph[0]
    edge = cyto_graph[-1]
    assert node['data']['id'] == 1  # type: ignore
    assert edge['data']['source'] == 3  # type: ignore
    assert edge['data']['target'] == 4  # type: ignore
    assert edge['data']['weight'] == 4  # type: ignore
    assert weight_data['min'] == 1
    assert weight_data['max'] == 6


def test_tk_graph_properties(tk_graph):
    assert tk_graph.name == TK_GRAPH_NAME
    assert isinstance(tk_graph.directed, graphs.TokenGraph)
    assert isinstance(tk_graph.undirected, nx.Graph)
    tk_graph.update_metadata()
    metadata_directed = tk_graph.metadata_directed
    assert metadata_directed['num_nodes'] == 4
    assert metadata_directed['num_edges'] == 6
    assert metadata_directed['min_edge_weight'] == 1
    assert metadata_directed['max_edge_weight'] == 6
    assert metadata_directed['node_memory'] == 112
    assert metadata_directed['edge_memory'] == 336
    assert metadata_directed['total_memory'] == 448
    metadata_undirected = tk_graph.metadata_undirected
    assert metadata_undirected['num_nodes'] == 4
    assert metadata_undirected['num_edges'] == 5
    assert metadata_undirected['min_edge_weight'] == 2
    assert metadata_undirected['max_edge_weight'] == 7
    assert metadata_undirected['node_memory'] == 112
    assert metadata_undirected['edge_memory'] == 280
    assert metadata_undirected['total_memory'] == 392


def test_graph_degree_filter(tk_graph):
    filtered_graph = graphs.filter_graph_by_node_degree(
        tk_graph,
        bound_lower=3,
        bound_upper=3,
    )
    assert len(filtered_graph.nodes) == 2


def test_graph_edge_number_filter(tk_graph):
    number_edges_limit = 1
    filtered_graph = graphs.filter_graph_by_number_edges(
        tk_graph,
        limit=number_edges_limit,
    )
    assert len(filtered_graph.edges) == number_edges_limit
    filtered_graph = graphs.filter_graph_by_node_degree(
        filtered_graph,
        bound_lower=1,
        bound_upper=None,
    )
    assert len(filtered_graph.nodes) == 2, 'one edge should result in only two nodes'

(new test file: unit tests for lang_main.analysis.preprocessing and shared cleansing helpers)
@@ -0,0 +1,73 @@
"""testing each function in a consecutive way like each one is
executed in in a pipeline
"""
from lang_main.analysis import preprocessing as ppc
from lang_main.analysis import shared


def test_load_data(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    assert len(data) == 1000


def test_remove_simple_duplicates(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    assert len(data) == 999


def test_remove_na(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    assert len(data) == 998


# def test_string_cleansing():
#     string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
#     cleaned_string = shared.clean_string_slim(string)
#     target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
#     assert cleaned_string == target_string


def test_entry_wise_cleansing(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    cleaned_string = shared.clean_string_slim(starting_string)
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert cleaned_string == target_string
    starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert data.at[0, 'VorgangsBeschreibung'] == starting_string
    (data,) = shared.entry_wise_cleansing(
        data,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert data.at[0, 'VorgangsBeschreibung'] == target_string


def test_analyse_feature(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    target_features: tuple[str] = ('VorgangsBeschreibung',)
    (data,) = ppc.remove_NA(data, target_features)
    starting_string = 'Ölleckage durch\nundichten \t Ölsumpf,, aber Dichtung intakt??!!!'
    cleaned_string = shared.clean_string_slim(starting_string)
    target_string = 'Ölleckage durch undichten Ölsumpf, aber Dichtung intakt!'
    assert cleaned_string == target_string
    starting_string = 'Ölleckage durch\nundichten Ölsumpf,, aber Dichtung intakt??!!!'
    assert data.at[0, 'VorgangsBeschreibung'] == starting_string
    (data,) = shared.entry_wise_cleansing(
        data,
        target_features=target_features,
        cleansing_func=shared.clean_string_slim,
    )
    assert data.at[0, 'VorgangsBeschreibung'] == target_string
    (data,) = ppc.analyse_feature(data, target_feature=target_features[0])
    assert len(data) == 139
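The first loading and cleaning steps recur in every test above; a possible shared fixture that would bundle them (a sketch only, not part of the commit):

# sketch of a shared fixture bundling the common preprocessing steps
import pytest

from lang_main.analysis import preprocessing as ppc


@pytest.fixture()
def cleaned_data(raw_data_path, raw_data_date_cols):
    (data,) = ppc.load_raw_data(raw_data_path, raw_data_date_cols)
    (data,) = ppc.remove_duplicates(data)
    (data,) = ppc.remove_NA(data, ('VorgangsBeschreibung',))
    return data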

tests/conftest.py (new file, 23 lines)
@@ -0,0 +1,23 @@
from pathlib import Path

import pytest

DATE_COLS: tuple[str, ...] = (
    'VorgangsDatum',
    'ErledigungsDatum',
    'Arbeitsbeginn',
    'ErstellungsDatum',
)


@pytest.fixture(scope='session')
def raw_data_path():
    pth_data = Path('./tests/Dummy_Dataset_N_1000.csv')
    assert pth_data.exists()
    return pth_data


@pytest.fixture(scope='session')
def raw_data_date_cols():
    return DATE_COLS
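These fixtures assume the dummy dataset sits at tests/Dummy_Dataset_N_1000.csv and that the suite is started from the repository root; one way to run it while skipping the heavyweight model-loading tests (the 'mload' marker used further below is assumed to be registered):

# run the test suite programmatically, deselecting the 'mload'-marked tests
import pytest

exit_code = pytest.main(['tests', '-m', 'not mload'])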

(deleted file: an old lang_main config)
@@ -1,56 +0,0 @@
# lang_main: Config file
[paths]
inputs = '../scripts/inputs/'
results = '../scripts/results/test_new2/'
dataset = '../data/02_202307/Export4.csv'
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.model_input]
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

tests/test_config.py (new file, 7 lines)
@@ -0,0 +1,7 @@
from lang_main import config, pkg_dir


def test_load_config():
    toml_path = pkg_dir / 'lang_main_config.toml'
    loaded_cfg = config.load_toml_config(toml_path)
    assert loaded_cfg['info']['pkg'] == 'lang_main'

tests/test_io.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import pytest

from lang_main import io

CONTENT = 'test_lang_main'


@pytest.mark.parametrize(
    'overwrite',
    [True, False],
)
def test_create_saving_folder(tmp_path, overwrite):
    target_dir = tmp_path / 'test'
    assert not target_dir.exists()
    io.create_saving_folder(target_dir, overwrite_existing=overwrite)
    assert target_dir.exists()
    assert target_dir.is_dir()


def test_save_load(tmp_path):
    save_pth = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, save_pth)
    loaded = io.load_pickle(save_pth)
    assert loaded == CONTENT
    b64_str = io.encode_to_base64_str(CONTENT)
    b64_str_file = io.encode_file_to_base64_str(save_pth)
    assert b64_str == b64_str_file
    b64_decoded = io.decode_from_base64_str(b64_str)
    assert b64_decoded == CONTENT
    b64_decoded_file = io.decode_from_base64_str(b64_str_file)
    assert b64_decoded_file == CONTENT


def test_get_entry_point(tmp_path):
    save_pth = tmp_path / 'test_lang_main.pkl'
    io.save_pickle(CONTENT, save_pth)
    pth = io.get_entry_point(
        tmp_path,
        'test_lang_main',
        '.pkl',
        check_existence=True,
    )
    assert pth.exists()
    with pytest.raises(FileNotFoundError):
        _ = io.get_entry_point(
            tmp_path,
            'test_lang_main2',
            '.pkl',
            check_existence=True,
        )
    pth = io.get_entry_point(
        tmp_path,
        'test_lang_main2',
        '.pkl',
        check_existence=False,
    )
    assert not pth.exists()
(new test file: smoke test for BASE_PATH)
@@ -0,0 +1,5 @@
from lang_main import BASE_PATH


def test_base_path():
    assert BASE_PATH is not None

tests/test_model_loader.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import pytest
from sentence_transformers import SentenceTransformer
from spacy.language import Language

from lang_main import model_loader
from lang_main.constants import (
    STFR_MODEL_ARGS_ONNX,
    SimilarityFunction,
    SpacyModelTypes,
    STFRBackends,
    STFRDeviceTypes,
    STFRModelTypes,
)
from lang_main.types import LanguageModels


@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer(
    model_name,
    similarity_func,
) -> None:
    model = model_loader.load_sentence_transformer(
        model_name=model_name,
        similarity_func=similarity_func,
        backend=STFRBackends.TORCH,
        device=STFRDeviceTypes.CPU,
        model_kwargs=None,
    )
    assert isinstance(model, SentenceTransformer)


@pytest.mark.parametrize(
    'similarity_func',
    [
        SimilarityFunction.COSINE,
        SimilarityFunction.DOT,
    ],
)
@pytest.mark.parametrize(
    'model_name',
    [
        STFRModelTypes.ALL_DISTILROBERTA_V1,
        STFRModelTypes.ALL_MINI_LM_L12_V2,
        STFRModelTypes.ALL_MINI_LM_L6_V2,
        STFRModelTypes.ALL_MPNET_BASE_V2,
    ],
)
@pytest.mark.mload
def test_load_sentence_transformer_onnx(
    model_name,
    similarity_func,
) -> None:
    model = model_loader.load_sentence_transformer(
        model_name=model_name,
        similarity_func=similarity_func,
        backend=STFRBackends.ONNX,
        device=STFRDeviceTypes.CPU,
        model_kwargs=STFR_MODEL_ARGS_ONNX,  # type: ignore
    )
    assert isinstance(model, SentenceTransformer)


@pytest.mark.parametrize(
    'model_name',
    [
        SpacyModelTypes.DE_CORE_NEWS_SM,
        SpacyModelTypes.DE_CORE_NEWS_MD,
        SpacyModelTypes.DE_CORE_NEWS_LG,
        SpacyModelTypes.DE_DEP_NEWS_TRF,
    ],
)
@pytest.mark.mload
def test_load_spacy_model(
    model_name,
):
    model = model_loader.load_spacy(
        model_name=model_name,
    )
    assert isinstance(model, Language)


@pytest.mark.mload
def test_instantiate_spacy_model():
    model = model_loader.instantiate_model(
        model_load_map=model_loader.MODEL_LOADER_MAP,
        model=LanguageModels.SPACY,
    )
    assert isinstance(model, Language)


@pytest.mark.mload
def test_instantiate_stfr_model():
    model = model_loader.instantiate_model(
        model_load_map=model_loader.MODEL_LOADER_MAP,
        model=LanguageModels.SENTENCE_TRANSFORMER,
    )
    assert isinstance(model, SentenceTransformer)
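The custom 'mload' marker is not registered anywhere in this commit; to avoid unknown-marker warnings it would typically be declared, for example via a pytest_configure hook in tests/conftest.py (an assumed addition, not part of the commit):

# assumed addition to tests/conftest.py registering the custom marker
def pytest_configure(config):
    config.addinivalue_line(
        'markers',
        'mload: tests that download or load full language models',
    )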