major changes in several aspects
parent 27d40d5c99
commit a0ca71ea87
lang-main.code-workspace (new file, +8)
@@ -0,0 +1,8 @@
+{
+    "folders": [
+        {
+            "path": "."
+        }
+    ],
+    "settings": {}
+}
@@ -1236,7 +1236,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,
@@ -11,6 +11,251 @@
     "%autoreload 2"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e1c76cc-6f99-484a-b73c-c99d9d567bbd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f34b343b-a7b6-4a6d-8db4-7f2f71addc54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from importlib.util import find_spec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "59c32d49-5bad-4f48-b91f-f069487ff543",
+   "metadata": {},
+   "outputs": [],
+   "source": [
"importlib.util.find_spec('nunpy')"
|
||||
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "dd3d9434-374d-42a4-9c08-230495d0f5a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
"_has_dash: bool = True if (find_spec('dash') and find_spec('kalido')) else False"
|
||||
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "4da22bea-d086-4097-86f6-37784b59e02b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_has_dash"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "dfc51145-759e-43ee-a850-580149679c5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_has_py4cyto"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbdbc7c1-4dd6-49d6-b570-3fc7eb60938f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "36aa5c9c-c29a-4a05-8854-30b086a9240e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b8afbf49-e61c-49ce-af53-694b01f90702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "curr_path = Path.cwd()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "c353967a-2702-4370-b4f2-914ac3b950a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
"list(curr_path.glob('./lang_main_consdfig.toml'))"
|
||||
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "48ee9b11-60fa-43d4-87de-c9a2172a563b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pkg_dir = curr_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "02d404e1-a352-44f6-a4e9-96b6aed0c3a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks')"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pkg_dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "fc9142c2-9e59-4113-811c-7a03068857f2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks'),\n",
+       " WindowsPath('A:/Arbeitsaufgaben/lang-main'),\n",
+       " WindowsPath('A:/Arbeitsaufgaben'),\n",
+       " WindowsPath('A:/')]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list((pkg_dir/'home.py').parents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "ac1312bf-332f-4008-8951-c53e35f37990",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cfg_found = False\n",
+    "for it in range(len(pkg_dir.parents)):\n",
+    "    search_path = pkg_dir.parents[it]\n",
+    "    res = tuple(search_path.glob(f'lang_main*.toml'))\n",
+    "    if res:\n",
+    "        cfg_found = True\n",
+    "        target = res[0]\n",
+    "        break\n",
+    "    if search_path.name == 'python':\n",
+    "        break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "a9bb96cc-f8a7-4749-ae2b-62e363d18f54",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cfg_found"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "58472aba-e887-4c50-857a-894c1c5c9003",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "WindowsPath('A:/Arbeitsaufgaben/lang-main/lang_main_config.toml')"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "target"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
@@ -11562,7 +11807,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,
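These scratch cells prototype the config lookup that this commit ships in lang_main/__init__.py further down: walk the parent directories upward, take the first lang_main*.toml hit, and never search above a folder named 'python'. The same loop, restated as a compact standalone sketch (the starting directory is hypothetical; the notebook used A:/Arbeitsaufgaben/lang-main/notebooks):

from pathlib import Path

start = Path.cwd()
target = None
for parent in start.parents:  # .parents excludes `start` itself
    hits = tuple(parent.glob('lang_main*.toml'))
    if hits:
        target = hits[0]  # first match wins
        break
    if parent.name == 'python':  # deployment layout: stop at the bundled interpreter dir
        break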
@@ -8,21 +8,31 @@ authors = [
 dependencies = [
     "pandas>=2.2.2",
     "networkx>=3.3",
-    "spacy[lookups,transformers]>=3.7.4",
-    "sentence-transformers>=2.7.0",
-    "numpy<=1.26.4",
+    "spacy>=3.7.4",
+    "sentence-transformers[onnx]>=3.2.0",
+    "numpy>=1.26.4",
     "pip>=24.0",
     "typing-extensions>=4.12.2",
-    "plotly>=5.22.0",
-    "dash>=2.17.0",
-    "dash-cytoscape>=1.0.1",
-    "py4cytoscape>=1.9.0",
-    "kaleido==0.2.1",
+    "tqdm>=4.67.0",
+    "python-dateutil>=2.9.0.post0",
 ]
 requires-python = ">=3.11"
 readme = "README.md"
 license = {text = "LicenseRef-Proprietary"}
 
+[project.optional-dependencies]
+dash = [
+    "dash-cytoscape>=1.0.2",
+    "dash>=2.18.2",
+    "kaleido==0.2.1",
+]
+plot = [
+    "kaleido==0.2.1",
+    "plotly>=5.24.1",
+]
+cytoscape = [
+    "py4cytoscape>=1.11.0",
+]
 [build-system]
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
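The visualisation stack (dash, plotly, py4cytoscape, kaleido) moves out of the mandatory dependencies into extras, so a base install stays slim. A hedged sketch of the matching runtime check; the guard name mirrors `_has_dash` from lang_main.constants below, and the distribution name `lang-main` is assumed:

from importlib.util import find_spec

# both packages must resolve for the dash-based rendering path
_has_dash = bool(find_spec('dash') and find_spec('kaleido'))
if not _has_dash:
    print("dash extra missing; install with: pip install 'lang-main[dash]'")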
@@ -1,9 +1,9 @@
 import time
 import webbrowser
 from collections.abc import Collection, Iterable
-from pathlib import Path
 from threading import Thread
 from typing import Any, Final, cast
+from pathlib import Path
 
 # import dash_cytoscape as cyto
 import plotly.express as px
scripts/lang_main_config.old.toml (new file, +58)
@@ -0,0 +1,58 @@
+# lang_main: Config file
+
+[paths]
+inputs = './inputs/'
+# results = './results/dummy_N_1000/'
+# dataset = '../data/Dummy_Dataset_N_1000.csv'
+results = './results/test_20240807/'
+dataset = '../data/02_202307/Export4.csv'
+
+# only debugging features, production-ready pipelines should always
+# be fully executed
+[control]
+preprocessing_skip = true
+token_analysis_skip = true
+graph_postprocessing_skip = false
+graph_rescaling_skip = false
+graph_static_rendering_skip = true
+time_analysis_skip = true
+
+[preprocess]
+date_cols = [
+    "VorgangsDatum",
+    "ErledigungsDatum",
+    "Arbeitsbeginn",
+    "ErstellungsDatum",
+]
+threshold_amount_characters = 5
+threshold_similarity = 0.8
+
+[graph_postprocessing]
+threshold_edge_number = 330
+# threshold_edge_weight = 150
+
+[time_analysis.uniqueness]
+threshold_unique_texts = 4
+criterion_feature = 'HObjektText'
+feature_name_obj_id = 'ObjektID'
+
+[time_analysis.preparation]
+name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
+name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
+
+[time_analysis.model_input]
+# input_features = [
+#     'VorgangsTypName',
+#     'VorgangsArtText',
+#     'VorgangsBeschreibung',
+# ]
+input_features = [
+    'VorgangsBeschreibung',
+]
+activity_feature = 'VorgangsTypName'
+activity_types = [
+    'Reparaturauftrag (Portal)',
+    'Störungsmeldung',
+]
+threshold_num_acitivities = 1
+threshold_similarity = 0.8
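The file is parsed via lang_main.io.load_toml_config, whose body is not part of this diff; a minimal stand-in under the assumption that it simply reads the TOML into a dict:

import tomllib
from pathlib import Path
from typing import Any

def load_toml_config(path_to_toml: Path) -> dict[str, Any]:
    # stand-in for lang_main.io.load_toml_config (not shown in this commit)
    with path_to_toml.open('rb') as file:
        return tomllib.load(file)

cfg = load_toml_config(Path('scripts/lang_main_config.old.toml'))
assert cfg['control']['preprocessing_skip'] is True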
@@ -7,14 +7,19 @@ inputs = './inputs/'
 results = './results/test_20240807/'
 dataset = '../data/02_202307/Export4.csv'
 
+[logging]
+enabled = true
+stderr = true
+file = true
+
 # only debugging features, production-ready pipelines should always
 # be fully executed
 [control]
-preprocessing_skip = true
+preprocessing_skip = false
 token_analysis_skip = true
-graph_postprocessing_skip = false
-graph_rescaling_skip = false
-graph_static_rendering_skip = false
+graph_postprocessing_skip = true
+graph_rescaling_skip = true
+graph_static_rendering_skip = true
 time_analysis_skip = true
 
 [preprocess]
@@ -1,56 +1,127 @@
 import logging
 import os
 import shutil
 import sys
 from pathlib import Path
 from time import gmtime
 from typing import Any, Final
 
-import py4cytoscape as p4c
+_has_py4cyto: bool = True
+try:
+    import py4cytoscape as p4c
+except ImportError:
+    _has_py4cyto = False
 
 from lang_main.io import load_toml_config
 
 # ** py4cytoscape config
-p4c.set_summary_logger(False)
-p4c.py4cytoscape_logger.detail_logger.setLevel('ERROR')
-p4c.py4cytoscape_logger.detail_logger.removeHandler(p4c.py4cytoscape_logger.detail_handler)
-p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
+if _has_py4cyto:
+    p4c.set_summary_logger(False)
+    p4c.py4cytoscape_logger.detail_logger.setLevel('ERROR')
+    p4c.py4cytoscape_logger.detail_logger.removeHandler(
+        p4c.py4cytoscape_logger.detail_handler
+    )
+    p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
 
 # ** lang-main config
-logging.Formatter.converter = gmtime
-LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
-LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
-logging.basicConfig(
-    stream=sys.stdout,
-    format=LOG_FMT,
-    datefmt=LOG_DATE_FMT,
-)
-
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
-USE_INTERNAL_CONFIG: Final[bool] = False
+PREFER_INTERNAL_CONFIG: Final[bool] = False
 pkg_dir = Path(__file__).parent
 cfg_path_internal = (pkg_dir / CONFIG_FILENAME).resolve()
 cyto_stylesheet_path = (pkg_dir / CYTO_STYLESHEET_FILENAME).resolve()
 
-# ** load config data: internal/external
-if USE_INTERNAL_CONFIG:
-    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
-else:
-    cfg_path_external = (Path.cwd() / CONFIG_FILENAME).resolve()
-    if not cfg_path_external.exists():
-        shutil.copy(cfg_path_internal, cfg_path_external)
-        sys.exit(
-            (
-                'No config file was found. A new one with default values was created '
-                'in the execution path. Please fill in the necessary values and '
-                'restart the programm.'
-            )
-        )
-        # raise NotImplementedError("External config data not implemented yet.")
-    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
-
-CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
+# ** load config data: internal/external
+# look for external config first, if not found use internal one
+def search_cwd(
+    glob_pattern: str = CONFIG_FILENAME,
+) -> Path | None:
+    """Searches the current working directory and looks for files
+    matching the glob pattern.
+    Returns the first match encountered.
+
+    Parameters
+    ----------
+    glob_pattern : str, optional
+        pattern to look for, first match will be returned,
+        by default CONFIG_FILENAME
+
+    Returns
+    -------
+    Path | None
+        Path if corresponding object was found, None otherwise
+    """
+    cfg_path: Path | None = None
+    res = tuple(Path.cwd().glob(glob_pattern))
+    if res:
+        cfg_path = res[0]
+
+    return cfg_path
+
+
+def search_iterative(
+    starting_path: Path,
+    glob_pattern: str = CONFIG_FILENAME,
+    stop_folder_name: str | None = None,
+) -> Path | None:
"""Iteratively searches the parent directories of the starting path
|
||||
and look for files matching the glob pattern. The starting path is not
|
||||
searched, only its parents. Therefore the starting path can also point
|
||||
to a file. The folder in which it is placed in will be searched.
|
||||
+    Returns the first match encountered.
+
+    Parameters
+    ----------
+    starting_path : Path
+        non-inclusive starting path
+    glob_pattern : str, optional
+        pattern to look for, first match will be returned,
+        by default CONFIG_FILENAME
+    stop_folder_name : str, optional
+        name of the last folder in the directory tree to search, by default 'python'
+
+    Returns
+    -------
+    Path | None
+        Path if corresponding object was found, None otherwise
+    """
+    cfg_path: Path | None = None
+    for it in range(len(starting_path.parents)):
+        search_path = starting_path.parents[it]  # do not look in library folder
+        res = tuple(search_path.glob(glob_pattern))
+        if res:
+            cfg_path = res[0]
+            break
+
+        if stop_folder_name is not None and search_path.name == stop_folder_name:
+            # library is placed inside a whole python installation for deployment
+            # only look up to this folder
+            break
+
+    return cfg_path
+
+
+def load_cfg() -> dict[str, Any]:
+    cfg_path: Path | None
+    if PREFER_INTERNAL_CONFIG:
+        cfg_path = cfg_path_internal
+    else:
+        cfg_path = search_cwd(glob_pattern=CONFIG_FILENAME)
+
+    if cfg_path is None:
+        cfg_path = search_iterative(
+            starting_path=pkg_dir,
+            glob_pattern=CONFIG_FILENAME,
+            stop_folder_name='python',
+        )
+        # backup: use internal config
+        if cfg_path is None:
+            cfg_path = cfg_path_internal
+
+    config = load_toml_config(path_to_toml=cfg_path)
+
+    return config.copy()
+
+
+CONFIG: Final[dict[str, Any]] = load_cfg()
 
 
 # ** Cytoscape configuration
 # stylesheet
@@ -64,7 +135,7 @@ CYTO_PATH_STYLESHEET: Final[Path] = cyto_stylesheet_path
 
 # TODO check removal
 # append Graphviz binary folder to system path if not already contained
-if sys.platform == 'win32':
-    path = Path(r'C:\Program Files\Graphviz\bin')
-    if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
-        os.environ['PATH'] += f';{path}'
+# if sys.platform == 'win32':
+#     path = Path(r'C:\Program Files\Graphviz\bin')
+#     if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
+#         os.environ['PATH'] += f';{path}'
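Net effect of the refactor: instead of copying a default config and calling sys.exit, load_cfg() resolves a path through a fixed fallback chain (internal if preferred, else cwd, else parent walk, else the packaged default) and never aborts. The parent-walk semantics in isolation, as a runnable check:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    (root / 'lang_main_config.toml').touch()
    nested = root / 'pkg' / 'lang_main'
    nested.mkdir(parents=True)
    # like search_iterative: the starting path itself is skipped, parents are
    # probed bottom-up, and the first glob hit wins
    hit = next(p for parent in nested.parents for p in parent.glob('lang_main_config.toml'))
    assert hit == root / 'lang_main_config.toml'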
@@ -386,7 +386,7 @@ def pipe_rescale_graph_edge_weights(
 
 
 def normalise_array_linear(
-    array: npt.NDArray[np.float_],
+    array: npt.NDArray[np.float32],
 ) -> npt.NDArray[np.float32]:
     """apply standard linear normalisation
 
@@ -445,7 +445,7 @@ def verify_property(
     graph: Graph | DiGraph,
     property: str,
 ) -> None:
-    for idx, (node_1, node_2) in enumerate(graph.edges):
+    for node_1, node_2 in graph.edges:
         if property not in graph[node_1][node_2]:
             raise EdgePropertyNotContainedError(
                 (
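Two small cleanups: np.float_ is gone in NumPy 2.x, so the annotation is pinned to float32, and an unused enumerate index is dropped. For context, a sketch of a linear (min-max) normalisation matching that signature; only the signature comes from the diff, the body is assumed:

import numpy as np
import numpy.typing as npt

def normalise_array_linear(
    array: npt.NDArray[np.float32],
) -> npt.NDArray[np.float32]:
    # min-max scaling to [0, 1]; constant arrays map to zeros
    low, high = float(array.min()), float(array.max())
    if high == low:
        return np.zeros_like(array)
    return ((array - low) / (high - low)).astype(np.float32)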
@@ -1,8 +1,10 @@
 from __future__ import annotations
 
 from collections.abc import Collection
 from itertools import combinations
 from math import factorial
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
+import numpy as np
+import pandas as pd
@@ -21,15 +23,10 @@ from lang_main.analysis.shared import (
     similar_index_groups,
 )
 from lang_main.loggers import logger_preprocess as logger
-from lang_main.pipelines.base import Pipeline
 from lang_main.types import Embedding, PandasIndex
 
-# TODO removal
-# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
-# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
-# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
-# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
-# pattern_whitespace = re.compile(r'[ ]{2,}')
+if TYPE_CHECKING:
+    from lang_main.pipelines.base import Pipeline
 
 
 # ** (1) dataset preparation: loading and simple preprocessing
@@ -5,8 +5,9 @@ from typing import cast
 import networkx as nx
 import numpy as np
 import numpy.typing as npt
-import sentence_transformers
-import sentence_transformers.util
+
+# import sentence_transformers  # TODO check removal
+# import sentence_transformers.util  # TODO check removal
 from networkx import Graph
 from pandas import DataFrame, Series
 from sentence_transformers import SentenceTransformer
@@ -76,7 +77,6 @@ def candidates_by_index(
     data_model_input: Series,
     model: SentenceTransformer,
     cos_sim_threshold: float = 0.5,
-    # ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
 ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
     """function to filter candidate indices based on cosine similarity
     using SentenceTransformer model in batch mode,
@@ -111,7 +111,9 @@ def candidates_by_index(
         ),
     )
     # cosine similarity
-    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
+    cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
+    # TODO check removal
+    # cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
    np.fill_diagonal(cos_sim, 0.0)
     cos_sim = np.triu(cos_sim)
     cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
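The module-level sentence_transformers.util import becomes unnecessary because SentenceTransformer instances expose similarity() directly since sentence-transformers 3.x, applying the similarity_fn_name fixed at construction. A standalone sketch; the checkpoint is a small public model chosen only for the demo:

from sentence_transformers import SentenceTransformer, SimilarityFunction

model = SentenceTransformer('all-MiniLM-L6-v2', similarity_fn_name=SimilarityFunction.COSINE)
embds = model.encode(['pump defect', 'pump broken', 'window cleaned'])
scores = model.similarity(embds, embds)  # with COSINE this matches util.cos_sim
print(scores.numpy().round(2))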
@@ -11,6 +11,11 @@ from lang_main.analysis.graphs import (
     TokenGraph,
     update_graph,
 )
+from lang_main.constants import (
+    POS_INDIRECT,
+    POS_OF_INTEREST,
+    TAG_OF_INTEREST,
+)
 from lang_main.loggers import logger_token_analysis as logger
 from lang_main.types import (
     PandasIndex,
@@ -19,24 +24,8 @@ from lang_main.types import (
     SpacyToken,
 )
 
-# ** POS
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
-POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
-
-# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
-POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
-
-# ** TAG
-# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
-TAG_OF_INTEREST: frozenset[str] = frozenset()
-
-
 # ** obtaining connection in texts
 
 
 def pre_clean_word(string: str) -> str:
     pattern = r'[^A-Za-zäöüÄÖÜ]+'
     string = re.sub(pattern, '', string)
@@ -49,7 +38,6 @@ def is_str_date(
     string: str,
     fuzzy: bool = False,
 ) -> bool:
-    # print(string)
     try:
         # check if string is a number
         # if length is greater than 8, it is not a date
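The POS/TAG frozensets move out of the analysis module into lang_main.constants, so tuning them no longer means editing analysis code. For orientation, a minimal sketch of the kind of token filtering such sets drive (illustrative only; the commit's own filtering functions are not part of this hunk):

import spacy

POS_OF_INTEREST = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])

nlp = spacy.load('de_dep_news_trf')  # model name from SPACY_MODEL_NAME in constants
doc = nlp('Pumpe defekt, Techniker tauscht Dichtung aus.')
kept = [token.text for token in doc if token.pos_ in POS_OF_INTEREST]  # content words only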
@@ -1,6 +1,10 @@
 from enum import Enum  # noqa: I001
+from importlib.util import find_spec
+from pathlib import Path
 from typing import Final
 
+from sentence_transformers import SimilarityFunction
+
 from lang_main import CONFIG, CYTO_PATH_STYLESHEET
 from lang_main import model_loader as m_load
 from lang_main.types import (
@@ -8,7 +12,12 @@ from lang_main.types import (
     CytoLayouts,
     LanguageModels,
     ModelLoaderMap,
+    ONNXExecutionProvider,  # noqa: F401
+    STFRBackends,
+    STFRDeviceTypes,
+    STFRModelArgs,
+    STFRModels,
+    STFRQuantFilenames,  # noqa: F401
 )
 
 __all__ = [
@@ -16,8 +25,23 @@ __all__ = [
     'CYTO_PATH_STYLESHEET',
 ]
 
+# ** dependencies
+_has_py4cyto: bool = True if find_spec('py4cytoscape') else False
+_has_dash: bool = True if (find_spec('dash') and find_spec('kaleido')) else False
+_has_plotly: bool = True if (find_spec('plotly') and find_spec('kaleido')) else False
+
+
+class Dependencies(Enum):
+    PY4C = _has_py4cyto
+    DASH = _has_dash
+    PLOT = _has_plotly
+
+
 # ** logging
 # graphs
+ENABLE_LOGGING: Final[bool] = CONFIG['logging']['enabled']
+LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file']
+LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr']
 LOGGING_DEFAULT_GRAPHS: Final[bool] = False
 
 # ** paths
@@ -44,14 +68,25 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
 # ** models
 # ** loading
 SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
-STFR_MODEL_NAME: Final[str] = 'sentence-transformers/all-mpnet-base-v2'
+STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
+STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
+STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
+STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
+STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
+# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
+#     'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
+#     'provider': ONNXExecutionProvider.CPU,
+#     'export': False,
+# }
 MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
     LanguageModels.SENTENCE_TRANSFORMER: {
         'func': m_load.load_sentence_transformer,
         'kwargs': {
             'model_name': STFR_MODEL_NAME,
+            'similarity_func': STFR_SIMILARITY,
+            'backend': STFR_BACKEND,
+            'device': STFR_DEVICE,
+            'model_kwargs': STFR_MODEL_ARGS,
         },
     },
     LanguageModels.SPACY: {
@@ -61,6 +96,19 @@ MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
         },
     },
 }
+# ** language dependency analysis
+# ** POS
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
+POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
+# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
+POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
+# ** TAG
+# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
+TAG_OF_INTEREST: frozenset[str] = frozenset()
 
 
 # ** export
 # ** preprocessing
@@ -74,7 +122,6 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
 # ** graph postprocessing
 EDGE_WEIGHT_DECIMALS: Final[int] = 4
 THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
-# THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
 PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
 
 # ** graph exports (Cytoscape)
@@ -14,3 +14,25 @@ class EmptyEdgesError(EmptyGraphError):
 
 class GraphRenderError(Exception):
     """Error raised if a graph object can not be rendered"""
+
+
+class DependencyMissingError(Exception):
+    """Error raised if needed dependency could not be found"""
+
+
+# ** pipelines to perform given actions on dataset in a customisable manner
+
+
+class NoPerformableActionError(Exception):
+    """Error describing that no action is available in the current pipeline"""
+
+
+class WrongActionTypeError(Exception):
+    """Error raised if added action type is not supported by corresponding pipeline"""
+
+
+class OutputInPipelineContainerError(Exception):
+    """Error raised if an output was detected by one of the performed
+    actions in a PipelineContainer. Each action in a PipelineContainer is itself a
+    procedure which does not have any parameters or return values and should therefore not
+    return any values."""
@@ -7,6 +7,11 @@ inputs = './inputs/'
 results = './results/test_20240807/'
 dataset = '../data/02_202307/Export4.csv'
 
+[logging]
+enabled = true
+stderr = true
+file = true
+
 # only debugging features, production-ready pipelines should always
 # be fully executed
 [control]
@@ -1,16 +1,65 @@
 import logging
+import logging.handlers
+from pathlib import Path
+from time import gmtime
 from typing import Final
 
+from lang_main.constants import (
+    ENABLE_LOGGING,
+    LOGGING_TO_FILE,
+    LOGGING_TO_STDERR,
+)
 from lang_main.types import LoggingLevels
 
-# ** logging
-LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
+# ** config
+logging.Formatter.converter = gmtime
+LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
+LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
+LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
+# logging.basicConfig(
+#     format=LOG_FMT,
+#     datefmt=LOG_DATE_FMT,
+# )
+
+# ** formatters
+logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
+
+# ** handlers
+null_handler = logging.NullHandler()
+if ENABLE_LOGGING and LOGGING_TO_STDERR:
+    logger_all_handler_stderr = logging.StreamHandler()
+    logger_all_handler_stderr.setLevel(LoggingLevels.WARNING)
+    logger_all_handler_stderr.setFormatter(logger_all_formater)
+else:
+    logger_all_handler_stderr = null_handler
+
+if ENABLE_LOGGING and LOGGING_TO_FILE:
+    logger_all_handler_file = logging.handlers.RotatingFileHandler(
+        LOG_FILE_PATH,
+        encoding='utf-8',
+        maxBytes=5_242_880,
+        backupCount=1,
+    )
+    logger_all_handler_file.setLevel(LoggingLevels.DEBUG)
+    logger_all_handler_file.setFormatter(logger_all_formater)
+else:
+    logger_all_handler_file = null_handler
+
+
+# ** logging levels
+LOGGING_LEVEL_ALL: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
-LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.DEBUG
 
 # ** loggers and configuration
 logger_all = logging.getLogger('lang_main')
+logger_all.addHandler(logger_all_handler_stderr)
+logger_all.addHandler(logger_all_handler_file)
 
 logger_shared_helpers = logging.getLogger('lang_main.shared')
 logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
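Handler wiring is now gated by the [logging] block added to the config: a disabled channel gets a NullHandler, so the logger graph stays identical either way. A short usage sketch against the names defined in this hunk:

from lang_main.loggers import logger_shared_helpers as logger

logger.debug('reaches lang-main.log (rotating file handler accepts DEBUG)')
logger.warning('also printed to stderr (stream handler is gated at WARNING)')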
@@ -1,18 +1,28 @@
 from __future__ import annotations
 
-from typing import Literal, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    overload,
+)
 
 import spacy
 from sentence_transformers import SentenceTransformer
 
+from lang_main.constants import STFR_SIMILARITY
 from lang_main.types import (
     LanguageModels,
     Model,
     ModelLoaderMap,
     SpacyModel,
+    STFRBackends,
+    STFRDeviceTypes,
 )
 
+if TYPE_CHECKING:
+    from sentence_transformers import SimilarityFunction
+
 
 @overload
 def instantiate_model(
@@ -48,6 +58,15 @@ def load_spacy(
 
 def load_sentence_transformer(
     model_name: str,
-    device: STFRDeviceTypes,
+    similarity_func: SimilarityFunction = STFR_SIMILARITY,
+    backend: STFRBackends = STFRBackends.TORCH,
+    device: STFRDeviceTypes = STFRDeviceTypes.CPU,
+    model_kwargs: dict[str, Any] | None = None,
 ) -> SentenceTransformer:
-    return SentenceTransformer(model_name_or_path=model_name, device=device)
+    return SentenceTransformer(
+        model_name_or_path=model_name,
+        similarity_fn_name=similarity_func,
+        backend=backend,  # type: ignore Literal matches Enum
+        device=device,
+        model_kwargs=model_kwargs,
+    )
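load_sentence_transformer now forwards similarity function, backend, and model_kwargs to the constructor, which is what makes the quantised ONNX configuration sketched in constants.py possible. What enabling it would amount to, with plain strings in place of the enums (a sketch, not the commit's wiring):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    'all-mpnet-base-v2',
    backend='onnx',  # requires the sentence-transformers[onnx] extra from pyproject.toml
    model_kwargs={
        'file_name': 'onnx/model_quint8_avx2.onnx',  # quantised weights
        'provider': 'CPUExecutionProvider',
        'export': False,
    },
)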
@@ -6,27 +6,15 @@ from pathlib import Path
 from typing import Any, Never, cast
 from typing_extensions import override
 
+from lang_main.errors import (
+    NoPerformableActionError,
+    OutputInPipelineContainerError,
+    WrongActionTypeError,
+)
 from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_pipelines as logger
 from lang_main.types import ResultHandling
 
-# ** pipelines to perform given actions on dataset in a customisable manner
-
-
-class NoPerformableActionError(Exception):
-    """Error describing that no action is available in the current pipeline"""
-
-
-class WrongActionTypeError(Exception):
-    """Error raised if added action type is not supported by corresponding pipeline"""
-
-
-class OutputInPipelineContainerError(Exception):
-    """Error raised if an output was detected by one of the performed
-    actions in a PipelineContainer. Each action in a PipelineContainer is itself a
-    procedure which does not have any parameters or return values and should therefore not
-    return any values."""
-
-
 class BasePipeline(ABC):
     def __init__(
@@ -42,7 +42,6 @@ from lang_main.constants import (
     UNIQUE_CRITERION_FEATURE,
 )
 from lang_main.pipelines.base import Pipeline
-from lang_main.render import cytoscape as cyto
 from lang_main.types import EntryPoints, LanguageModels
 
 # ** Models
@@ -137,13 +136,6 @@ def build_tk_graph_post_pipe() -> Pipeline:
     pipe_graph_postprocessing = Pipeline(
         name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
     )
-    # pipe_graph_postprocessing.add(
-    #     graphs.filter_graph_by_edge_weight,
-    #     {
-    #         'bound_lower': THRESHOLD_EDGE_WEIGHT,
-    #         'bound_upper': None,
-    #     },
-    # )
     pipe_graph_postprocessing.add(
         graphs.filter_graph_by_number_edges,
         {
@@ -190,6 +182,10 @@ def build_tk_graph_render_pipe(
     export_folder: Path = SAVE_PATH_FOLDER,
     base_network_name: str = CYTO_BASE_NETWORK_NAME,
 ) -> Pipeline:
+    # optional dependency: late import
+    # raises exception if necessary modules are not found
+    from lang_main.render import cytoscape as cyto
+
     pipe_graph_rendering = Pipeline(
         name='Graph_Static-Rendering',
         working_dir=SAVE_PATH_FOLDER,
@@ -0,0 +1,7 @@
+from lang_main.constants import Dependencies
+from lang_main.errors import DependencyMissingError
+
+if not Dependencies.PY4C.value:
+    raise DependencyMissingError(
+        'The module >>render<< needs the package >>Py4Cytoscape<<. Package not found.'
+    )
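Together with the late import in build_tk_graph_render_pipe, this guard defers the failure until rendering is actually requested. Caller-side handling could look like this (hypothetical, not part of the commit):

from lang_main.errors import DependencyMissingError

try:
    from lang_main.render import cytoscape as cyto  # runs the guard above
except DependencyMissingError as err:
    print(f'static rendering disabled: {err}')  # degrade instead of crashing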
@@ -3,6 +3,7 @@ from collections.abc import Callable, Hashable
 from typing import (
     Any,
     Literal,
+    NotRequired,
     Required,
     TypeAlias,
     TypedDict,
@@ -40,6 +41,27 @@ class LanguageModels(enum.StrEnum):
     SPACY = enum.auto()
 
 
+class ONNXExecutionProvider(enum.StrEnum):
+    CPU = 'CPUExecutionProvider'
+
+
+class STFRModels(enum.StrEnum):
+    ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
+    ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
+    ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
+    ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'
+
+
+class STFRQuantFilenames(enum.StrEnum):
+    ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'
+
+
+class STFRModelArgs(TypedDict):
+    provider: NotRequired[ONNXExecutionProvider]
+    file_name: NotRequired[STFRQuantFilenames]
+    export: NotRequired[bool]
+
+
 Model: TypeAlias = SentenceTransformer | SpacyModel
 ModelLoaderFunc: TypeAlias = Callable[..., Model]
 
@@ -52,7 +74,12 @@ class ModelLoaderInfo(TypedDict):
 ModelLoaderMap: TypeAlias = dict[LanguageModels, ModelLoaderInfo]
 
 
-# ** devices
+class STFRBackends(enum.StrEnum):
+    TORCH = enum.auto()
+    ONNX = enum.auto()
+    OPENVINO = enum.auto()
+
+
 class STFRDeviceTypes(enum.StrEnum):
     CPU = enum.auto()
     GPU = enum.auto()
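These StrEnum types are what lets constants.py hand enum members straight to the sentence-transformers API: with enum.auto(), StrEnum derives the value from the lower-cased member name, so members compare equal to the plain strings the library expects, which is also why model_loader.py needs its 'Literal matches Enum' type-ignore. A quick demonstration:

import enum

class STFRBackends(enum.StrEnum):
    TORCH = enum.auto()
    ONNX = enum.auto()
    OPENVINO = enum.auto()

assert STFRBackends.TORCH == 'torch'  # auto() -> lower-cased member name
assert f'backend={STFRBackends.ONNX}' == 'backend=onnx'  # str subclass formats cleanly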