major changes in several aspects: optional dependency groups, iterative config file discovery, configurable logging, and sentence-transformers backend selection

This commit is contained in:
Florian Förster 2024-11-07 17:30:33 +01:00
parent 27d40d5c99
commit a0ca71ea87
22 changed files with 1628 additions and 479 deletions

lang-main.code-workspace (new file, 8 lines)
View File

@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

View File

@ -1236,7 +1236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,

View File

@ -11,6 +11,251 @@
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e1c76cc-6f99-484a-b73c-c99d9d567bbd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f34b343b-a7b6-4a6d-8db4-7f2f71addc54",
"metadata": {},
"outputs": [],
"source": [
"from importlib.util import find_spec"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "59c32d49-5bad-4f48-b91f-f069487ff543",
"metadata": {},
"outputs": [],
"source": [
"importlib.util.find_spec('nunpy')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "dd3d9434-374d-42a4-9c08-230495d0f5a6",
"metadata": {},
"outputs": [],
"source": [
"_has_dash: bool = True if (find_spec('dash') and find_spec('kalido')) else False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4da22bea-d086-4097-86f6-37784b59e02b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_has_dash"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "dfc51145-759e-43ee-a850-580149679c5b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_has_py4cyto"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbdbc7c1-4dd6-49d6-b570-3fc7eb60938f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 18,
"id": "36aa5c9c-c29a-4a05-8854-30b086a9240e",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b8afbf49-e61c-49ce-af53-694b01f90702",
"metadata": {},
"outputs": [],
"source": [
"curr_path = Path.cwd()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "c353967a-2702-4370-b4f2-914ac3b950a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(curr_path.glob('./lang_main_consdfig.toml'))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "48ee9b11-60fa-43d4-87de-c9a2172a563b",
"metadata": {},
"outputs": [],
"source": [
"pkg_dir = curr_path"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "02d404e1-a352-44f6-a4e9-96b6aed0c3a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks')"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pkg_dir"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "fc9142c2-9e59-4113-811c-7a03068857f2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main'),\n",
" WindowsPath('A:/Arbeitsaufgaben'),\n",
" WindowsPath('A:/')]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list((pkg_dir/'home.py').parents)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "ac1312bf-332f-4008-8951-c53e35f37990",
"metadata": {},
"outputs": [],
"source": [
"cfg_found = False\n",
"for it in range(len(pkg_dir.parents)):\n",
" search_path = pkg_dir.parents[it]\n",
" res = tuple(search_path.glob(f'lang_main*.toml'))\n",
" if res:\n",
" cfg_found = True\n",
" target = res[0]\n",
" break\n",
" if search_path.name == 'python':\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "a9bb96cc-f8a7-4749-ae2b-62e363d18f54",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cfg_found"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "58472aba-e887-4c50-857a-894c1c5c9003",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-main/lang_main_config.toml')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target"
]
},
{
"cell_type": "code",
"execution_count": 2,
@ -11562,7 +11807,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,

pdm.lock (generated, 1315 lines changed)

File diff suppressed because it is too large

View File

@ -8,21 +8,31 @@ authors = [
dependencies = [
"pandas>=2.2.2",
"networkx>=3.3",
"spacy[lookups,transformers]>=3.7.4",
"sentence-transformers>=2.7.0",
"numpy<=1.26.4",
"spacy>=3.7.4",
"sentence-transformers[onnx]>=3.2.0",
"numpy>=1.26.4",
"pip>=24.0",
"typing-extensions>=4.12.2",
"plotly>=5.22.0",
"dash>=2.17.0",
"dash-cytoscape>=1.0.1",
"py4cytoscape>=1.9.0",
"kaleido==0.2.1",
"tqdm>=4.67.0",
"python-dateutil>=2.9.0.post0",
]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "LicenseRef-Proprietary"}
[project.optional-dependencies]
dash = [
"dash-cytoscape>=1.0.2",
"dash>=2.18.2",
"kaleido==0.2.1",
]
plot = [
"kaleido==0.2.1",
"plotly>=5.24.1",
]
cytoscape = [
"py4cytoscape>=1.11.0",
]
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
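
The visualisation stack (dash, plotly/kaleido, py4cytoscape) moves from hard requirements into optional dependency groups. Runtime code probes for these packages instead of assuming them; a minimal sketch of that probing pattern, mirroring what constants.py does further down in this commit (variable names taken from there, grouping illustrative):

from importlib.util import find_spec

# True only if the corresponding optional extra is actually installed
_has_py4cyto = find_spec('py4cytoscape') is not None
_has_dash = find_spec('dash') is not None and find_spec('kaleido') is not None
_has_plotly = find_spec('plotly') is not None and find_spec('kaleido') is not None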

View File

@ -1,9 +1,9 @@
import time
import webbrowser
from collections.abc import Collection, Iterable
from pathlib import Path
from threading import Thread
from typing import Any, Final, cast
from pathlib import Path
# import dash_cytoscape as cyto
import plotly.express as px

View File

@ -0,0 +1,58 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'
# debugging features only; production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
token_analysis_skip = true
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_number = 330
# threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
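
This is the internal default config shipped with the package; lang_main.io.load_toml_config (not part of this diff) reads it. A minimal sketch of such a loader on Python 3.11, assuming it is a thin wrapper around the standard-library tomllib:

import tomllib
from pathlib import Path
from typing import Any

def load_toml_config(path_to_toml: Path) -> dict[str, Any]:
    # assumption: plain tomllib read, no validation
    with open(path_to_toml, 'rb') as file:
        return tomllib.load(file)

cfg = load_toml_config(Path('lang_main_config.toml'))
skip_preprocessing = cfg['control']['preprocessing_skip']  # -> True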

View File

@ -7,14 +7,19 @@ inputs = './inputs/'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
preprocessing_skip = false
token_analysis_skip = true
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
graph_postprocessing_skip = true
graph_rescaling_skip = true
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]

View File

@ -1,56 +1,127 @@
import logging
import os
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
_has_py4cyto: bool = True
try:
import py4cytoscape as p4c
except ImportError:
_has_py4cyto = False
from lang_main.io import load_toml_config
# ** py4cytoscape config
if _has_py4cyto:
p4c.set_summary_logger(False)
p4c.py4cytoscape_logger.detail_logger.setLevel('ERROR')
p4c.py4cytoscape_logger.detail_logger.removeHandler(p4c.py4cytoscape_logger.detail_handler)
p4c.py4cytoscape_logger.detail_logger.removeHandler(
p4c.py4cytoscape_logger.detail_handler
)
p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
# ** lang-main config
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
USE_INTERNAL_CONFIG: Final[bool] = False
PREFER_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = (pkg_dir / CONFIG_FILENAME).resolve()
cyto_stylesheet_path = (pkg_dir / CYTO_STYLESHEET_FILENAME).resolve()
# ** load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
cfg_path_external = (Path.cwd() / CONFIG_FILENAME).resolve()
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(
(
'No config file was found. A new one with default values was created '
'in the execution path. Please fill in the necessary values and '
'restart the program.'
)
)
# raise NotImplementedError("External config data not implemented yet.")
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
# ** load config data: internal/external
# look for external config first, if not found use internal one
def search_cwd(
glob_pattern: str = CONFIG_FILENAME,
) -> Path | None:
"""Searches the current working directory and looks for files
matching the glob pattern.
Returns the first match encountered.
Parameters
----------
glob_pattern : str, optional
pattern to look for, first match will be returned,
by default CONFIG_FILENAME
Returns
-------
Path | None
path of the first matching file if one was found, None otherwise
"""
cfg_path: Path | None = None
res = tuple(Path.cwd().glob(glob_pattern))
if res:
cfg_path = res[0]
return cfg_path
def search_iterative(
starting_path: Path,
glob_pattern: str = CONFIG_FILENAME,
stop_folder_name: str | None = None,
) -> Path | None:
"""Iteratively searches the parent directories of the starting path
and looks for files matching the glob pattern. The starting path itself is not
searched, only its parents; it may therefore also point to a file, in which
case the folder containing it is searched.
Returns the first match encountered.
Parameters
----------
starting_path : Path
non-inclusive starting path
glob_pattern : str, optional
pattern to look for, first match will be returned,
by default CONFIG_FILENAME
stop_folder_name : str | None, optional
name of the last folder in the directory tree to search, by default None
Returns
-------
Path | None
path of the first matching file if one was found, None otherwise
"""
cfg_path: Path | None = None
for it in range(len(starting_path.parents)):
search_path = starting_path.parents[it] # do not look in library folder
res = tuple(search_path.glob(glob_pattern))
if res:
cfg_path = res[0]
break
if stop_folder_name is not None and search_path.name == stop_folder_name:
# library is placed inside a whole python installation for deployment
# only look up to this folder
break
return cfg_path
def load_cfg() -> dict[str, Any]:
cfg_path: Path | None
if PREFER_INTERNAL_CONFIG:
cfg_path = cfg_path_internal
else:
cfg_path = search_cwd(glob_pattern=CONFIG_FILENAME)
if cfg_path is None:
cfg_path = search_iterative(
starting_path=pkg_dir,
glob_pattern=CONFIG_FILENAME,
stop_folder_name='python',
)
# backup: use internal config
if cfg_path is None:
cfg_path = cfg_path_internal
config = load_toml_config(path_to_toml=cfg_path)
return config.copy()
CONFIG: Final[dict[str, Any]] = load_cfg()
# ** Cytoscape configuration
# stylesheet
@ -64,7 +135,7 @@ CYTO_PATH_STYLESHEET: Final[Path] = cyto_stylesheet_path
# TODO check removal
# append Graphviz binary folder to system path if not already contained
if sys.platform == 'win32':
path = Path(r'C:\Program Files\Graphviz\bin')
if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
os.environ['PATH'] += f';{path}'
# if sys.platform == 'win32':
# path = Path(r'C:\Program Files\Graphviz\bin')
# if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
# os.environ['PATH'] += f';{path}'
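
Instead of the old USE_INTERNAL_CONFIG switch, load_cfg now resolves the config file in three steps. A condensed usage sketch (names from the module above; the lookup order is the substance, the surrounding lines are illustrative):

from lang_main import CONFIG

# load_cfg() resolution order (unless PREFER_INTERNAL_CONFIG is set):
#  1. lang_main_config.toml in the current working directory (search_cwd)
#  2. the parent folders of the installed package, stopping at a folder named
#     'python' (search_iterative) -- covers the library being embedded in a
#     bundled Python installation
#  3. the internal default config shipped next to __init__.py
skip_preprocessing = CONFIG['control']['preprocessing_skip']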

View File

@ -386,7 +386,7 @@ def pipe_rescale_graph_edge_weights(
def normalise_array_linear(
array: npt.NDArray[np.float_],
array: npt.NDArray[np.float32],
) -> npt.NDArray[np.float32]:
"""apply standard linear normalisation
@ -445,7 +445,7 @@ def verify_property(
graph: Graph | DiGraph,
property: str,
) -> None:
for idx, (node_1, node_2) in enumerate(graph.edges):
for node_1, node_2 in graph.edges:
if property not in graph[node_1][node_2]:
raise EdgePropertyNotContainedError(
(

View File

@ -1,8 +1,10 @@
from __future__ import annotations
from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import cast
from typing import TYPE_CHECKING, cast
import numpy as np
import pandas as pd
@ -21,15 +23,10 @@ from lang_main.analysis.shared import (
similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex
# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')
if TYPE_CHECKING:
from lang_main.pipelines.base import Pipeline
# ** (1) dataset preparation: loading and simple preprocessing
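
Pulling the Pipeline import under TYPE_CHECKING (together with from __future__ import annotations) keeps the name available to type checkers without importing lang_main.pipelines.base at runtime, the usual move to avoid an import cycle or defer a heavy import. The pattern in isolation, sketched with a hypothetical function:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # evaluated by static type checkers only, never at runtime
    from lang_main.pipelines.base import Pipeline

def run_in(pipeline: Pipeline) -> None:  # hypothetical; the annotation stays a string at runtime
    ...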

View File

@ -5,8 +5,9 @@ from typing import cast
import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
# import sentence_transformers # TODO check removal
# import sentence_transformers.util # TODO check removal
from networkx import Graph
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
@ -76,7 +77,6 @@ def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
# ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
@ -111,7 +111,9 @@ def candidates_by_index(
),
)
# cosine similarity
cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
# TODO check removal
# cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
np.fill_diagonal(cos_sim, 0.0)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
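
model.similarity() is the instance-level replacement for sentence_transformers.util.cos_sim available since sentence-transformers 3.x (the pyproject now requires >=3.2.0); it applies the similarity_fn_name the model was created with, COSINE in constants.py. A self-contained sketch, assuming the all-mpnet-base-v2 checkpoint can be downloaded or is cached locally:

from sentence_transformers import SentenceTransformer, SimilarityFunction

model = SentenceTransformer(
    'all-mpnet-base-v2',
    similarity_fn_name=SimilarityFunction.COSINE,
)
embds = model.encode(['Pumpe defekt', 'Pumpe ausgefallen', 'Filter getauscht'])
scores = model.similarity(embds, embds)  # torch.Tensor of pairwise cosine scores
print(scores.numpy())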

View File

@ -11,6 +11,11 @@ from lang_main.analysis.graphs import (
TokenGraph,
update_graph,
)
from lang_main.constants import (
POS_INDIRECT,
POS_OF_INTEREST,
TAG_OF_INTEREST,
)
from lang_main.loggers import logger_token_analysis as logger
from lang_main.types import (
PandasIndex,
@ -19,24 +24,8 @@ from lang_main.types import (
SpacyToken,
)
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connection in texts
def pre_clean_word(string: str) -> str:
pattern = r'[^A-Za-zäöüÄÖÜ]+'
string = re.sub(pattern, '', string)
@ -49,7 +38,6 @@ def is_str_date(
string: str,
fuzzy: bool = False,
) -> bool:
# print(string)
try:
# check if string is a number
# if length is greater than 8, it is not a date

View File

@ -1,6 +1,10 @@
from enum import Enum # noqa: I001
from importlib.util import find_spec
from pathlib import Path
from typing import Final
from sentence_transformers import SimilarityFunction
from lang_main import CONFIG, CYTO_PATH_STYLESHEET
from lang_main import model_loader as m_load
from lang_main.types import (
@ -8,7 +12,12 @@ from lang_main.types import (
CytoLayouts,
LanguageModels,
ModelLoaderMap,
ONNXExecutionProvider, # noqa: F401
STFRBackends,
STFRDeviceTypes,
STFRModelArgs,
STFRModels,
STFRQuantFilenames, # noqa: F401
)
__all__ = [
@ -16,8 +25,23 @@ __all__ = [
'CYTO_PATH_STYLESHEET',
]
# ** dependencies
_has_py4cyto: bool = True if find_spec('py4cytoscape') else False
_has_dash: bool = True if (find_spec('dash') and find_spec('kaleido')) else False
_has_plotly: bool = True if (find_spec('plotly') and find_spec('kaleido')) else False
class Dependencies(Enum):
PY4C = _has_py4cyto
DASH = _has_dash
PLOT = _has_plotly
# ** logging
# graphs
ENABLE_LOGGING: Final[bool] = CONFIG['logging']['enabled']
LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file']
LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr']
LOGGING_DEFAULT_GRAPHS: Final[bool] = False
# ** paths
@ -44,14 +68,25 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** models
# ** loading
SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
STFR_MODEL_NAME: Final[str] = 'sentence-transformers/all-mpnet-base-v2'
STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
# 'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
# 'provider': ONNXExecutionProvider.CPU,
# 'export': False,
# }
MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
LanguageModels.SENTENCE_TRANSFORMER: {
'func': m_load.load_sentence_transformer,
'kwargs': {
'model_name': STFR_MODEL_NAME,
'similarity_func': STFR_SIMILARITY,
'backend': STFR_BACKEND,
'device': STFR_DEVICE,
'model_kwargs': STFR_MODEL_ARGS,
},
},
LanguageModels.SPACY: {
@ -61,6 +96,19 @@ MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
},
},
}
# ** language dependency analysis
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** export
# ** preprocessing
@ -74,7 +122,6 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
# ** graph postprocessing
EDGE_WEIGHT_DECIMALS: Final[int] = 4
THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
# THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
# ** graph exports (Cytoscape)

View File

@ -14,3 +14,25 @@ class EmptyEdgesError(EmptyGraphError):
class GraphRenderError(Exception):
"""Error raised if a graph object can not be rendered"""
class DependencyMissingError(Exception):
"""Error raised if needed dependency could not be found"""
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""
class WrongActionTypeError(Exception):
"""Error raised if added action type is not supported by corresponding pipeline"""
class OutputInPipelineContainerError(Exception):
"""Error raised if an output was detected by one of the performed
actions in a PipelineContainer. Each action in a PipelineContainer is itself a
procedure which does not have any parameters or return values and should therefore not
return any values."""

View File

@ -7,6 +7,11 @@ inputs = './inputs/'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control]

View File

@ -1,16 +1,65 @@
import logging
import logging.handlers
from pathlib import Path
from time import gmtime
from typing import Final
from lang_main.constants import (
ENABLE_LOGGING,
LOGGING_TO_FILE,
LOGGING_TO_STDERR,
)
from lang_main.types import LoggingLevels
# ** logging
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
# ** config
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
# logging.basicConfig(
# format=LOG_FMT,
# datefmt=LOG_DATE_FMT,
# )
# ** formatters
logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
# ** handlers
null_handler = logging.NullHandler()
if ENABLE_LOGGING and LOGGING_TO_STDERR:
logger_all_handler_stderr = logging.StreamHandler()
logger_all_handler_stderr.setLevel(LoggingLevels.WARNING)
logger_all_handler_stderr.setFormatter(logger_all_formater)
else:
logger_all_handler_stderr = null_handler
if ENABLE_LOGGING and LOGGING_TO_FILE:
logger_all_handler_file = logging.handlers.RotatingFileHandler(
LOG_FILE_PATH,
encoding='utf-8',
maxBytes=5_242_880,
backupCount=1,
)
logger_all_handler_file.setLevel(LoggingLevels.DEBUG)
logger_all_handler_file.setFormatter(logger_all_formater)
else:
logger_all_handler_file = null_handler
# ** logging levels
LOGGING_LEVEL_ALL: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.DEBUG
# ** loggers and configuration
logger_all = logging.getLogger('lang_main')
logger_all.addHandler(logger_all_handler_stderr)
logger_all.addHandler(logger_all_handler_file)
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
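
All module loggers are children of the 'lang_main' logger, so the stderr handler (WARNING) and the rotating file handler (DEBUG) attached to logger_all cover the whole package through propagation. A minimal sketch of the effect in downstream code (logger name illustrative):

import logging

logger = logging.getLogger('lang_main.analysis.graphs')  # any child of 'lang_main'
logger.debug('written to lang-main.log only (file handler level: DEBUG), logger level permitting')
logger.warning('written to lang-main.log and stderr (stderr handler level: WARNING)')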

View File

@ -1,18 +1,28 @@
from __future__ import annotations
from typing import Literal, overload
from typing import (
TYPE_CHECKING,
Any,
Literal,
overload,
)
import spacy
from sentence_transformers import SentenceTransformer
from lang_main.constants import STFR_SIMILARITY
from lang_main.types import (
LanguageModels,
Model,
ModelLoaderMap,
SpacyModel,
STFRBackends,
STFRDeviceTypes,
)
if TYPE_CHECKING:
from sentence_transformers import SimilarityFunction
@overload
def instantiate_model(
@ -48,6 +58,15 @@ def load_spacy(
def load_sentence_transformer(
model_name: str,
device: STFRDeviceTypes,
similarity_func: SimilarityFunction = STFR_SIMILARITY,
backend: STFRBackends = STFRBackends.TORCH,
device: STFRDeviceTypes = STFRDeviceTypes.CPU,
model_kwargs: dict[str, Any] | None = None,
) -> SentenceTransformer:
return SentenceTransformer(model_name_or_path=model_name, device=device)
return SentenceTransformer(
model_name_or_path=model_name,
similarity_fn_name=similarity_func,
backend=backend, # type: ignore Literal matches Enum
device=device,
model_kwargs=model_kwargs,
)
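
The widened signature makes the ONNX backend reachable through the new model_kwargs parameter, matching the commented-out STFR_MODEL_ARGS block in constants.py. A hedged usage sketch (enum members from lang_main.types; whether export is needed depends on the model repository, and the onnx extra of sentence-transformers must be installed, which the pyproject already requests):

from lang_main import model_loader as m_load
from lang_main.types import (
    ONNXExecutionProvider,
    STFRBackends,
    STFRDeviceTypes,
    STFRQuantFilenames,
)

# load the quantised ONNX variant instead of the default torch backend
model = m_load.load_sentence_transformer(
    model_name='all-mpnet-base-v2',
    backend=STFRBackends.ONNX,
    device=STFRDeviceTypes.CPU,
    model_kwargs={
        'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
        'provider': ONNXExecutionProvider.CPU,
        'export': False,
    },
)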

View File

@ -6,27 +6,15 @@ from pathlib import Path
from typing import Any, Never, cast
from typing_extensions import override
from lang_main.errors import (
NoPerformableActionError,
OutputInPipelineContainerError,
WrongActionTypeError,
)
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_pipelines as logger
from lang_main.types import ResultHandling
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""
class WrongActionTypeError(Exception):
"""Error raised if added action type is not supported by corresponding pipeline"""
class OutputInPipelineContainerError(Exception):
"""Error raised if an output was detected by one of the performed
actions in a PipelineContainer. Each action in a PipelineContainer is itself a
procedure which does not have any parameters or return values and should therefore not
return any values."""
class BasePipeline(ABC):
def __init__(

View File

@ -42,7 +42,6 @@ from lang_main.constants import (
UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import Pipeline
from lang_main.render import cytoscape as cyto
from lang_main.types import EntryPoints, LanguageModels
# ** Models
@ -137,13 +136,6 @@ def build_tk_graph_post_pipe() -> Pipeline:
pipe_graph_postprocessing = Pipeline(
name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
)
# pipe_graph_postprocessing.add(
# graphs.filter_graph_by_edge_weight,
# {
# 'bound_lower': THRESHOLD_EDGE_WEIGHT,
# 'bound_upper': None,
# },
# )
pipe_graph_postprocessing.add(
graphs.filter_graph_by_number_edges,
{
@ -190,6 +182,10 @@ def build_tk_graph_render_pipe(
export_folder: Path = SAVE_PATH_FOLDER,
base_network_name: str = CYTO_BASE_NETWORK_NAME,
) -> Pipeline:
# optional dependency: late import
# raises exception if necessary modules are not found
from lang_main.render import cytoscape as cyto
pipe_graph_rendering = Pipeline(
name='Graph_Static-Rendering',
working_dir=SAVE_PATH_FOLDER,

View File

@ -0,0 +1,7 @@
from lang_main.constants import Dependencies
from lang_main.errors import DependencyMissingError
if not Dependencies.PY4C.value:
raise DependencyMissingError(
'The module >>render<< needs the package >>Py4Cytoscape<<. Package not found.'
)
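
Because this check runs when the render package is imported, the late import inside build_tk_graph_render_pipe is what actually raises the error for callers without py4cytoscape. A sketch of how calling code might degrade gracefully (the try/except itself is illustrative, not part of this commit):

from lang_main.errors import DependencyMissingError

try:
    from lang_main.render import cytoscape as cyto
except DependencyMissingError:
    cyto = None  # rendering stays disabled when py4cytoscape is missing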

View File

@ -3,6 +3,7 @@ from collections.abc import Callable, Hashable
from typing import (
Any,
Literal,
NotRequired,
Required,
TypeAlias,
TypedDict,
@ -40,6 +41,27 @@ class LanguageModels(enum.StrEnum):
SPACY = enum.auto()
class ONNXExecutionProvider(enum.StrEnum):
CPU = 'CPUExecutionProvider'
class STFRModels(enum.StrEnum):
ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'
class STFRQuantFilenames(enum.StrEnum):
ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'
class STFRModelArgs(TypedDict):
provider: NotRequired[ONNXExecutionProvider]
file_name: NotRequired[STFRQuantFilenames]
export: NotRequired[bool]
Model: TypeAlias = SentenceTransformer | SpacyModel
ModelLoaderFunc: TypeAlias = Callable[..., Model]
@ -52,7 +74,12 @@ class ModelLoaderInfo(TypedDict):
ModelLoaderMap: TypeAlias = dict[LanguageModels, ModelLoaderInfo]
# ** devices
class STFRBackends(enum.StrEnum):
TORCH = enum.auto()
ONNX = enum.auto()
OPENVINO = enum.auto()
class STFRDeviceTypes(enum.StrEnum):
CPU = enum.auto()
GPU = enum.auto()