major changes in several aspects

commit a0ca71ea87
parent 27d40d5c99
lang-main.code-workspace (new file, 8 lines added)
@@ -0,0 +1,8 @@
+{
+    "folders": [
+        {
+            "path": "."
+        }
+    ],
+    "settings": {}
+}
@@ -1236,7 +1236,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
 "nbformat": 4,
@@ -11,6 +11,251 @@
     "%autoreload 2"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e1c76cc-6f99-484a-b73c-c99d9d567bbd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "f34b343b-a7b6-4a6d-8db4-7f2f71addc54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from importlib.util import find_spec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "59c32d49-5bad-4f48-b91f-f069487ff543",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "importlib.util.find_spec('nunpy')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "dd3d9434-374d-42a4-9c08-230495d0f5a6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_has_dash: bool = True if (find_spec('dash') and find_spec('kalido')) else False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "4da22bea-d086-4097-86f6-37784b59e02b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_has_dash"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "dfc51145-759e-43ee-a850-580149679c5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "_has_py4cyto"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbdbc7c1-4dd6-49d6-b570-3fc7eb60938f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "36aa5c9c-c29a-4a05-8854-30b086a9240e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b8afbf49-e61c-49ce-af53-694b01f90702",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "curr_path = Path.cwd()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "c353967a-2702-4370-b4f2-914ac3b950a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(curr_path.glob('./lang_main_consdfig.toml'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "48ee9b11-60fa-43d4-87de-c9a2172a563b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pkg_dir = curr_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "02d404e1-a352-44f6-a4e9-96b6aed0c3a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks')"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pkg_dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "fc9142c2-9e59-4113-811c-7a03068857f2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks'),\n",
+       " WindowsPath('A:/Arbeitsaufgaben/lang-main'),\n",
+       " WindowsPath('A:/Arbeitsaufgaben'),\n",
+       " WindowsPath('A:/')]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list((pkg_dir/'home.py').parents)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "ac1312bf-332f-4008-8951-c53e35f37990",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cfg_found = False\n",
+    "for it in range(len(pkg_dir.parents)):\n",
+    "    search_path = pkg_dir.parents[it]\n",
+    "    res = tuple(search_path.glob(f'lang_main*.toml'))\n",
+    "    if res:\n",
+    "        cfg_found = True\n",
+    "        target = res[0]\n",
+    "        break\n",
+    "    if search_path.name == 'python':\n",
+    "        break"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "a9bb96cc-f8a7-4749-ae2b-62e363d18f54",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cfg_found"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "58472aba-e887-4c50-857a-894c1c5c9003",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "WindowsPath('A:/Arbeitsaufgaben/lang-main/lang_main_config.toml')"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "target"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 2,
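The cells added above prototype optional-dependency detection with importlib.util.find_spec. A minimal sketch of that check, assuming the intended package names are 'numpy' and 'kaleido' (the cells probe the misspelled 'nunpy' and 'kalido', so find_spec returns None and the flags stay False):

from importlib.util import find_spec

# find_spec returns a ModuleSpec if the package is importable, else None
_has_dash: bool = find_spec('dash') is not None and find_spec('kaleido') is not None
print(_has_dash)  # False unless both dash and kaleido are installed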
@@ -11562,7 +11807,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
 "nbformat": 4,
@@ -8,21 +8,31 @@ authors = [
 dependencies = [
     "pandas>=2.2.2",
     "networkx>=3.3",
-    "spacy[lookups,transformers]>=3.7.4",
-    "sentence-transformers>=2.7.0",
-    "numpy<=1.26.4",
+    "spacy>=3.7.4",
+    "sentence-transformers[onnx]>=3.2.0",
+    "numpy>=1.26.4",
     "pip>=24.0",
     "typing-extensions>=4.12.2",
-    "plotly>=5.22.0",
-    "dash>=2.17.0",
-    "dash-cytoscape>=1.0.1",
-    "py4cytoscape>=1.9.0",
-    "kaleido==0.2.1",
+    "tqdm>=4.67.0",
+    "python-dateutil>=2.9.0.post0",
 ]
 requires-python = ">=3.11"
 readme = "README.md"
 license = {text = "LicenseRef-Proprietary"}
 
+[project.optional-dependencies]
+dash = [
+    "dash-cytoscape>=1.0.2",
+    "dash>=2.18.2",
+    "kaleido==0.2.1",
+]
+plot = [
+    "kaleido==0.2.1",
+    "plotly>=5.24.1",
+]
+cytoscape = [
+    "py4cytoscape>=1.11.0",
+]
 [build-system]
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
@@ -1,9 +1,9 @@
 import time
 import webbrowser
 from collections.abc import Collection, Iterable
+from pathlib import Path
 from threading import Thread
 from typing import Any, Final, cast
-from pathlib import Path
 
 # import dash_cytoscape as cyto
 import plotly.express as px
scripts/lang_main_config.old.toml (new file, 58 lines added)
@@ -0,0 +1,58 @@
+# lang_main: Config file
+
+[paths]
+inputs = './inputs/'
+# results = './results/dummy_N_1000/'
+# dataset = '../data/Dummy_Dataset_N_1000.csv'
+results = './results/test_20240807/'
+dataset = '../data/02_202307/Export4.csv'
+
+# only debugging features, production-ready pipelines should always
+# be fully executed
+[control]
+preprocessing_skip = true
+token_analysis_skip = true
+graph_postprocessing_skip = false
+graph_rescaling_skip = false
+graph_static_rendering_skip = true
+time_analysis_skip = true
+
+[preprocess]
+date_cols = [
+    "VorgangsDatum",
+    "ErledigungsDatum",
+    "Arbeitsbeginn",
+    "ErstellungsDatum",
+]
+threshold_amount_characters = 5
+threshold_similarity = 0.8
+
+[graph_postprocessing]
+threshold_edge_number = 330
+# threshold_edge_weight = 150
+
+[time_analysis.uniqueness]
+threshold_unique_texts = 4
+criterion_feature = 'HObjektText'
+feature_name_obj_id = 'ObjektID'
+
+[time_analysis.preparation]
+name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
+name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
+
+[time_analysis.model_input]
+# input_features = [
+#     'VorgangsTypName',
+#     'VorgangsArtText',
+#     'VorgangsBeschreibung',
+# ]
+input_features = [
+    'VorgangsBeschreibung',
+]
+activity_feature = 'VorgangsTypName'
+activity_types = [
+    'Reparaturauftrag (Portal)',
+    'Störungsmeldung',
+]
+threshold_num_acitivities = 1
+threshold_similarity = 0.8
@@ -7,14 +7,19 @@ inputs = './inputs/'
 results = './results/test_20240807/'
 dataset = '../data/02_202307/Export4.csv'
 
+[logging]
+enabled = true
+stderr = true
+file = true
+
 # only debugging features, production-ready pipelines should always
 # be fully executed
 [control]
-preprocessing_skip = true
+preprocessing_skip = false
 token_analysis_skip = true
-graph_postprocessing_skip = false
-graph_rescaling_skip = false
-graph_static_rendering_skip = false
+graph_postprocessing_skip = true
+graph_rescaling_skip = true
+graph_static_rendering_skip = true
 time_analysis_skip = true
 
 [preprocess]
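The new [logging] table is read like any other section of the TOML config. A short sketch of how it can be consumed, assuming Python 3.11+ (matching requires-python) and the stdlib tomllib; the flag names mirror the constants added later in this commit:

import tomllib  # stdlib TOML reader, Python 3.11+

with open('lang_main_config.toml', 'rb') as fh:
    cfg = tomllib.load(fh)

enable_logging: bool = cfg['logging']['enabled']
log_to_stderr: bool = cfg['logging']['stderr']
log_to_file: bool = cfg['logging']['file']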
@@ -1,56 +1,127 @@
 import logging
-import os
-import shutil
-import sys
 from pathlib import Path
-from time import gmtime
 from typing import Any, Final
 
-import py4cytoscape as p4c
+_has_py4cyto: bool = True
+try:
+    import py4cytoscape as p4c
+except ImportError:
+    _has_py4cyto = False
 
 from lang_main.io import load_toml_config
 
 # ** py4cytoscape config
-p4c.set_summary_logger(False)
-p4c.py4cytoscape_logger.detail_logger.setLevel('ERROR')
-p4c.py4cytoscape_logger.detail_logger.removeHandler(p4c.py4cytoscape_logger.detail_handler)
-p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
+if _has_py4cyto:
+    p4c.set_summary_logger(False)
+    p4c.py4cytoscape_logger.detail_logger.setLevel('ERROR')
+    p4c.py4cytoscape_logger.detail_logger.removeHandler(
+        p4c.py4cytoscape_logger.detail_handler
+    )
+    p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
 
 # ** lang-main config
-logging.Formatter.converter = gmtime
-LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
-LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
-logging.basicConfig(
-    stream=sys.stdout,
-    format=LOG_FMT,
-    datefmt=LOG_DATE_FMT,
-)
-
 CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
 CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
-USE_INTERNAL_CONFIG: Final[bool] = False
+PREFER_INTERNAL_CONFIG: Final[bool] = False
 pkg_dir = Path(__file__).parent
 cfg_path_internal = (pkg_dir / CONFIG_FILENAME).resolve()
 cyto_stylesheet_path = (pkg_dir / CYTO_STYLESHEET_FILENAME).resolve()
 
-# ** load config data: internal/external
-if USE_INTERNAL_CONFIG:
-    loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
-else:
-    cfg_path_external = (Path.cwd() / CONFIG_FILENAME).resolve()
-    if not cfg_path_external.exists():
-        shutil.copy(cfg_path_internal, cfg_path_external)
-        sys.exit(
-            (
-                'No config file was found. A new one with default values was created '
-                'in the execution path. Please fill in the necessary values and '
-                'restart the programm.'
-            )
-        )
-        # raise NotImplementedError("External config data not implemented yet.")
-    loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
-
-CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
+# ** load config data: internal/external
+# look for external config first, if not found use internal one
+def search_cwd(
+    glob_pattern: str = CONFIG_FILENAME,
+) -> Path | None:
+    """Searches the current working directory and looks for files
+    matching the glob pattern.
+    Returns the first match encountered.
+
+    Parameters
+    ----------
+    glob_pattern : str, optional
+        pattern to look for, first match will be returned,
+        by default CONFIG_FILENAME
+
+    Returns
+    -------
+    Path | None
+        Path if corresponding object was found, None otherwise
+    """
+    cfg_path: Path | None = None
+    res = tuple(Path.cwd().glob(glob_pattern))
+    if res:
+        cfg_path = res[0]
+
+    return cfg_path
+
+
+def search_iterative(
+    starting_path: Path,
+    glob_pattern: str = CONFIG_FILENAME,
+    stop_folder_name: str | None = None,
+) -> Path | None:
+    """Iteratively searches the parent directories of the starting path
+    and look for files matching the glob pattern. The starting path is not
+    searched, only its parents. Therefore the starting path can also point
+    to a file. The folder in which it is placed in will be searched.
+    Returns the first match encountered.
+
+    Parameters
+    ----------
+    starting_path : Path
+        non-inclusive starting path
+    glob_pattern : str, optional
+        pattern to look for, first match will be returned,
+        by default CONFIG_FILENAME
+    stop_folder_name : str, optional
+        name of the last folder in the directory tree to search, by default 'python'
+
+    Returns
+    -------
+    Path | None
+        Path if corresponding object was found, None otherwise
+    """
+    cfg_path: Path | None = None
+    for it in range(len(starting_path.parents)):
+        search_path = starting_path.parents[it]  # do not look in library folder
+        res = tuple(search_path.glob(glob_pattern))
+        if res:
+            cfg_path = res[0]
+            break
+
+        if stop_folder_name is not None and search_path.name == stop_folder_name:
+            # library is placed inside a whole python installation for deployment
+            # only look up to this folder
+            break
+
+    return cfg_path
+
+
+def load_cfg() -> dict[str, Any]:
+    cfg_path: Path | None
+    if PREFER_INTERNAL_CONFIG:
+        cfg_path = cfg_path_internal
+    else:
+        cfg_path = search_cwd(glob_pattern=CONFIG_FILENAME)
+
+    if cfg_path is None:
+        cfg_path = search_iterative(
+            starting_path=pkg_dir,
+            glob_pattern=CONFIG_FILENAME,
+            stop_folder_name='python',
+        )
+        # backup: use internal config
+        if cfg_path is None:
+            cfg_path = cfg_path_internal
+
+    config = load_toml_config(path_to_toml=cfg_path)
+
+    return config.copy()
+
+
+CONFIG: Final[dict[str, Any]] = load_cfg()
 
 # ** Cytoscape configuration
 # stylesheet
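The lookup order introduced above is: the packaged config when PREFER_INTERNAL_CONFIG is set, else the current working directory, else each parent of the package directory (stopping once a folder named 'python' is reached), else the packaged default as a backup. A self-contained sketch of that cascade; find_config is a hypothetical helper, only the file name and the search order mirror the diff:

from pathlib import Path

CONFIG_FILENAME = 'lang_main_config.toml'

def find_config(pkg_dir: Path, prefer_internal: bool = False) -> Path:
    internal = pkg_dir / CONFIG_FILENAME
    if prefer_internal:
        return internal
    # 1) current working directory
    hits = tuple(Path.cwd().glob(CONFIG_FILENAME))
    if hits:
        return hits[0]
    # 2) parents of the package directory, bounded by a 'python' folder
    for parent in pkg_dir.parents:
        hits = tuple(parent.glob(CONFIG_FILENAME))
        if hits:
            return hits[0]
        if parent.name == 'python':
            break
    # 3) backup: the config file shipped with the package
    return internal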
@@ -64,7 +135,7 @@ CYTO_PATH_STYLESHEET: Final[Path] = cyto_stylesheet_path
 
 # TODO check removal
 # append Graphviz binary folder to system path if not already contained
-if sys.platform == 'win32':
-    path = Path(r'C:\Program Files\Graphviz\bin')
-    if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
-        os.environ['PATH'] += f';{path}'
+# if sys.platform == 'win32':
+#     path = Path(r'C:\Program Files\Graphviz\bin')
+#     if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
+#         os.environ['PATH'] += f';{path}'
@@ -386,7 +386,7 @@ def pipe_rescale_graph_edge_weights(
 
 
 def normalise_array_linear(
-    array: npt.NDArray[np.float_],
+    array: npt.NDArray[np.float32],
 ) -> npt.NDArray[np.float32]:
     """apply standard linear normalisation
 
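np.float_ is the NumPy 1.x alias for float64 and was removed in NumPy 2.0, which is why the annotation moves to the concrete np.float32 (and why pyproject.toml flips numpy from <=1.26.4 to >=1.26.4). The function body is not part of the hunk; a minimal linear normalisation consistent with the new signature would look like:

import numpy as np
import numpy.typing as npt

def normalise_array_linear(
    array: npt.NDArray[np.float32],
) -> npt.NDArray[np.float32]:
    # rescale values into [0, 1]; assumes a non-constant array
    span = array.max() - array.min()
    return ((array - array.min()) / span).astype(np.float32)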
@@ -445,7 +445,7 @@ def verify_property(
     graph: Graph | DiGraph,
     property: str,
 ) -> None:
-    for idx, (node_1, node_2) in enumerate(graph.edges):
+    for node_1, node_2 in graph.edges:
         if property not in graph[node_1][node_2]:
             raise EdgePropertyNotContainedError(
                 (
@@ -1,8 +1,10 @@
+from __future__ import annotations
+
 from collections.abc import Collection
 from itertools import combinations
 from math import factorial
 from pathlib import Path
-from typing import cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
@@ -21,15 +23,10 @@ from lang_main.analysis.shared import (
     similar_index_groups,
 )
 from lang_main.loggers import logger_preprocess as logger
-from lang_main.pipelines.base import Pipeline
 from lang_main.types import Embedding, PandasIndex
 
-# TODO removal
-# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
-# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
-# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
-# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
-# pattern_whitespace = re.compile(r'[ ]{2,}')
+if TYPE_CHECKING:
+    from lang_main.pipelines.base import Pipeline
 
 
 # ** (1) dataset preparation: loading and simple preprocessing
@@ -5,8 +5,9 @@ from typing import cast
 import networkx as nx
 import numpy as np
 import numpy.typing as npt
-import sentence_transformers
-import sentence_transformers.util
+# import sentence_transformers  # TODO check removal
+# import sentence_transformers.util  # TODO check removal
 from networkx import Graph
 from pandas import DataFrame, Series
 from sentence_transformers import SentenceTransformer
@@ -76,7 +77,6 @@ def candidates_by_index(
     data_model_input: Series,
     model: SentenceTransformer,
     cos_sim_threshold: float = 0.5,
-    # ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
 ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
     """function to filter candidate indices based on cosine similarity
     using SentenceTransformer model in batch mode,
@@ -111,7 +111,9 @@ def candidates_by_index(
         ),
     )
     # cosine similarity
-    cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
+    cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
+    # TODO check removal
+    # cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
     np.fill_diagonal(cos_sim, 0.0)
     cos_sim = np.triu(cos_sim)
     cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
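SentenceTransformer.similarity (sentence-transformers 3.x) applies the similarity function the model was configured with, cosine by default, so it can stand in for the hard-coded util.cos_sim call. A short equivalence sketch with an example model:

import numpy as np
import sentence_transformers.util
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # example model
embds = model.encode(['pump failure', 'motor defect'])

old = sentence_transformers.util.cos_sim(embds, embds).numpy()
new = model.similarity(embds, embds).numpy()
assert np.allclose(old, new)  # identical for the default cosine setting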
@@ -11,6 +11,11 @@ from lang_main.analysis.graphs import (
     TokenGraph,
     update_graph,
 )
+from lang_main.constants import (
+    POS_INDIRECT,
+    POS_OF_INTEREST,
+    TAG_OF_INTEREST,
+)
 from lang_main.loggers import logger_token_analysis as logger
 from lang_main.types import (
     PandasIndex,
@@ -19,24 +24,8 @@ from lang_main.types import (
     SpacyToken,
 )
 
-# ** POS
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
-# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
-POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
-
-# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
-POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
-
-# ** TAG
-# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
-TAG_OF_INTEREST: frozenset[str] = frozenset()
-
 # ** obtaining connection in texts
 
 
 def pre_clean_word(string: str) -> str:
     pattern = r'[^A-Za-zäöüÄÖÜ]+'
     string = re.sub(pattern, '', string)
@@ -49,7 +38,6 @@ def is_str_date(
     string: str,
     fuzzy: bool = False,
 ) -> bool:
-    # print(string)
     try:
         # check if string is a number
         # if length is greater than 8, it is not a date
@@ -1,6 +1,10 @@
+from enum import Enum  # noqa: I001
+from importlib.util import find_spec
 from pathlib import Path
 from typing import Final
 
+from sentence_transformers import SimilarityFunction
+
 from lang_main import CONFIG, CYTO_PATH_STYLESHEET
 from lang_main import model_loader as m_load
 from lang_main.types import (
@@ -8,7 +12,12 @@ from lang_main.types import (
     CytoLayouts,
     LanguageModels,
     ModelLoaderMap,
+    ONNXExecutionProvider,  # noqa: F401
+    STFRBackends,
     STFRDeviceTypes,
+    STFRModelArgs,
+    STFRModels,
+    STFRQuantFilenames,  # noqa: F401
 )
 
 __all__ = [
@@ -16,8 +25,23 @@ __all__ = [
     'CYTO_PATH_STYLESHEET',
 ]
 
+# ** dependencies
+_has_py4cyto: bool = True if find_spec('py4cytoscape') else False
+_has_dash: bool = True if (find_spec('dash') and find_spec('kaleido')) else False
+_has_plotly: bool = True if (find_spec('plotly') and find_spec('kaleido')) else False
+
+
+class Dependencies(Enum):
+    PY4C = _has_py4cyto
+    DASH = _has_dash
+    PLOT = _has_plotly
+
+
 # ** logging
 # graphs
+ENABLE_LOGGING: Final[bool] = CONFIG['logging']['enabled']
+LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file']
+LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr']
 LOGGING_DEFAULT_GRAPHS: Final[bool] = False
 
 # ** paths
@@ -44,14 +68,25 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
 # ** models
 # ** loading
 SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
-STFR_MODEL_NAME: Final[str] = 'sentence-transformers/all-mpnet-base-v2'
+STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
 STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
+STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
+STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
+STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
+# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
+#     'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
+#     'provider': ONNXExecutionProvider.CPU,
+#     'export': False,
+# }
 MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
     LanguageModels.SENTENCE_TRANSFORMER: {
         'func': m_load.load_sentence_transformer,
         'kwargs': {
             'model_name': STFR_MODEL_NAME,
+            'similarity_func': STFR_SIMILARITY,
+            'backend': STFR_BACKEND,
             'device': STFR_DEVICE,
+            'model_kwargs': STFR_MODEL_ARGS,
         },
     },
     LanguageModels.SPACY: {
@@ -61,6 +96,19 @@ MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
         },
     },
 }
+
+# ** language dependency analysis
+# ** POS
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
+# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
+POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
+# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
+POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
+# ** TAG
+# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
+TAG_OF_INTEREST: frozenset[str] = frozenset()
 
 # ** export
 # ** preprocessing
@@ -74,7 +122,6 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity']
 # ** graph postprocessing
 EDGE_WEIGHT_DECIMALS: Final[int] = 4
 THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
-# THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
 PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
 
 # ** graph exports (Cytoscape)
@@ -14,3 +14,25 @@ class EmptyEdgesError(EmptyGraphError):
 
 class GraphRenderError(Exception):
     """Error raised if a graph object can not be rendered"""
+
+
+class DependencyMissingError(Exception):
+    """Error raised if needed dependency could not be found"""
+
+
+# ** pipelines to perform given actions on dataset in a customisable manner
+
+
+class NoPerformableActionError(Exception):
+    """Error describing that no action is available in the current pipeline"""
+
+
+class WrongActionTypeError(Exception):
+    """Error raised if added action type is not supported by corresponding pipeline"""
+
+
+class OutputInPipelineContainerError(Exception):
+    """Error raised if an output was detected by one of the performed
+    actions in a PipelineContainer. Each action in a PipelineContainer is itself a
+    procedure which does not have any parameters or return values and should therefore not
+    return any values."""
@@ -7,6 +7,11 @@ inputs = './inputs/'
 results = './results/test_20240807/'
 dataset = '../data/02_202307/Export4.csv'
 
+[logging]
+enabled = true
+stderr = true
+file = true
+
 # only debugging features, production-ready pipelines should always
 # be fully executed
 [control]
@@ -1,16 +1,65 @@
 import logging
+import logging.handlers
+from pathlib import Path
+from time import gmtime
 from typing import Final
 
+from lang_main.constants import (
+    ENABLE_LOGGING,
+    LOGGING_TO_FILE,
+    LOGGING_TO_STDERR,
+)
 from lang_main.types import LoggingLevels
 
-# ** logging
-LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
+# ** config
+logging.Formatter.converter = gmtime
+LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
+LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
+LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
+# logging.basicConfig(
+#     format=LOG_FMT,
+#     datefmt=LOG_DATE_FMT,
+# )
+
+# ** formatters
+logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
+
+# ** handlers
+null_handler = logging.NullHandler()
+if ENABLE_LOGGING and LOGGING_TO_STDERR:
+    logger_all_handler_stderr = logging.StreamHandler()
+    logger_all_handler_stderr.setLevel(LoggingLevels.WARNING)
+    logger_all_handler_stderr.setFormatter(logger_all_formater)
+else:
+    logger_all_handler_stderr = null_handler
+
+if ENABLE_LOGGING and LOGGING_TO_FILE:
+    logger_all_handler_file = logging.handlers.RotatingFileHandler(
+        LOG_FILE_PATH,
+        encoding='utf-8',
+        maxBytes=5_242_880,
+        backupCount=1,
+    )
+    logger_all_handler_file.setLevel(LoggingLevels.DEBUG)
+    logger_all_handler_file.setFormatter(logger_all_formater)
+else:
+    logger_all_handler_file = null_handler
+
+
+# ** logging levels
+LOGGING_LEVEL_ALL: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.DEBUG
 LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
-LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
-LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.INFO
+LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.DEBUG
+LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.DEBUG
+
+# ** loggers and configuration
+logger_all = logging.getLogger('lang_main')
+logger_all.addHandler(logger_all_handler_stderr)
+logger_all.addHandler(logger_all_handler_file)
+
 logger_shared_helpers = logging.getLogger('lang_main.shared')
 logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
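The child loggers only carry levels; their records propagate up to the 'lang_main' logger, which owns the stderr and rotating-file handlers. A minimal sketch of that propagation:

import logging

root = logging.getLogger('lang_main')          # owns the handlers
root.addHandler(logging.StreamHandler())       # stderr, WARNING+ in the diff

child = logging.getLogger('lang_main.shared')  # no handlers of its own
child.setLevel(logging.DEBUG)
child.warning('emitted via the parent handlers through propagation')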
@@ -1,18 +1,28 @@
 from __future__ import annotations
 
-from typing import Literal, overload
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Literal,
+    overload,
+)
 
 import spacy
 from sentence_transformers import SentenceTransformer
 
+from lang_main.constants import STFR_SIMILARITY
 from lang_main.types import (
     LanguageModels,
     Model,
     ModelLoaderMap,
     SpacyModel,
+    STFRBackends,
     STFRDeviceTypes,
 )
 
+if TYPE_CHECKING:
+    from sentence_transformers import SimilarityFunction
+
 
 @overload
 def instantiate_model(
@@ -48,6 +58,15 @@ def load_spacy(
 
 def load_sentence_transformer(
     model_name: str,
-    device: STFRDeviceTypes,
+    similarity_func: SimilarityFunction = STFR_SIMILARITY,
+    backend: STFRBackends = STFRBackends.TORCH,
+    device: STFRDeviceTypes = STFRDeviceTypes.CPU,
+    model_kwargs: dict[str, Any] | None = None,
 ) -> SentenceTransformer:
-    return SentenceTransformer(model_name_or_path=model_name, device=device)
+    return SentenceTransformer(
+        model_name_or_path=model_name,
+        similarity_fn_name=similarity_func,
+        backend=backend,  # type: ignore Literal matches Enum
+        device=device,
+        model_kwargs=model_kwargs,
+    )
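With the extended signature, the commented-out STFR_MODEL_ARGS block in constants.py corresponds to a call roughly like the following sketch, assuming sentence-transformers[onnx] is installed; the string literals match the enum values defined in types.py in this commit:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(
    model_name_or_path='all-mpnet-base-v2',
    similarity_fn_name='cosine',
    backend='onnx',
    device='cpu',
    model_kwargs={
        'file_name': 'onnx/model_quint8_avx2.onnx',  # quantized weights
        'provider': 'CPUExecutionProvider',
        'export': False,
    },
)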
@@ -6,27 +6,15 @@ from pathlib import Path
 from typing import Any, Never, cast
 from typing_extensions import override
 
+from lang_main.errors import (
+    NoPerformableActionError,
+    OutputInPipelineContainerError,
+    WrongActionTypeError,
+)
 from lang_main.io import load_pickle, save_pickle
 from lang_main.loggers import logger_pipelines as logger
 from lang_main.types import ResultHandling
 
-# ** pipelines to perform given actions on dataset in a customisable manner
-
-
-class NoPerformableActionError(Exception):
-    """Error describing that no action is available in the current pipeline"""
-
-
-class WrongActionTypeError(Exception):
-    """Error raised if added action type is not supported by corresponding pipeline"""
-
-
-class OutputInPipelineContainerError(Exception):
-    """Error raised if an output was detected by one of the performed
-    actions in a PipelineContainer. Each action in a PipelineContainer is itself a
-    procedure which does not have any parameters or return values and should therefore not
-    return any values."""
-
 
 class BasePipeline(ABC):
     def __init__(
@@ -42,7 +42,6 @@ from lang_main.constants import (
     UNIQUE_CRITERION_FEATURE,
 )
 from lang_main.pipelines.base import Pipeline
-from lang_main.render import cytoscape as cyto
 from lang_main.types import EntryPoints, LanguageModels
 
 # ** Models
@@ -137,13 +136,6 @@ def build_tk_graph_post_pipe() -> Pipeline:
     pipe_graph_postprocessing = Pipeline(
         name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
     )
-    # pipe_graph_postprocessing.add(
-    #     graphs.filter_graph_by_edge_weight,
-    #     {
-    #         'bound_lower': THRESHOLD_EDGE_WEIGHT,
-    #         'bound_upper': None,
-    #     },
-    # )
     pipe_graph_postprocessing.add(
         graphs.filter_graph_by_number_edges,
         {
@@ -190,6 +182,10 @@ def build_tk_graph_render_pipe(
     export_folder: Path = SAVE_PATH_FOLDER,
     base_network_name: str = CYTO_BASE_NETWORK_NAME,
 ) -> Pipeline:
+    # optional dependency: late import
+    # raises exception if necessary modules are not found
+    from lang_main.render import cytoscape as cyto
+
     pipe_graph_rendering = Pipeline(
         name='Graph_Static-Rendering',
         working_dir=SAVE_PATH_FOLDER,
@@ -0,0 +1,7 @@
+from lang_main.constants import Dependencies
+from lang_main.errors import DependencyMissingError
+
+if not Dependencies.PY4C.value:
+    raise DependencyMissingError(
+        'The module >>render<< needs the package >>Py4Cytoscape<<. Package not found.'
+    )
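Because the guard runs at import time, callers that can work without Cytoscape can treat the missing extra as a feature switch. A sketch of a tolerant call site:

from lang_main.errors import DependencyMissingError

try:
    from lang_main.render import cytoscape as cyto
except DependencyMissingError:
    cyto = None  # rendering disabled; py4cytoscape extra not installed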
@@ -3,6 +3,7 @@ from collections.abc import Callable, Hashable
 from typing import (
     Any,
     Literal,
+    NotRequired,
     Required,
     TypeAlias,
     TypedDict,
@@ -40,6 +41,27 @@ class LanguageModels(enum.StrEnum):
     SPACY = enum.auto()
 
 
+class ONNXExecutionProvider(enum.StrEnum):
+    CPU = 'CPUExecutionProvider'
+
+
+class STFRModels(enum.StrEnum):
+    ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
+    ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
+    ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
+    ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'
+
+
+class STFRQuantFilenames(enum.StrEnum):
+    ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'
+
+
+class STFRModelArgs(TypedDict):
+    provider: NotRequired[ONNXExecutionProvider]
+    file_name: NotRequired[STFRQuantFilenames]
+    export: NotRequired[bool]
+
+
 Model: TypeAlias = SentenceTransformer | SpacyModel
 ModelLoaderFunc: TypeAlias = Callable[..., Model]
 
@@ -52,7 +74,12 @@ class ModelLoaderInfo(TypedDict):
 ModelLoaderMap: TypeAlias = dict[LanguageModels, ModelLoaderInfo]
 
 
-# ** devices
+class STFRBackends(enum.StrEnum):
+    TORCH = enum.auto()
+    ONNX = enum.auto()
+    OPENVINO = enum.auto()
+
+
 class STFRDeviceTypes(enum.StrEnum):
     CPU = enum.auto()
     GPU = enum.auto()
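Every key of STFRModelArgs is NotRequired, so the empty dict used for the default torch backend and a partial dict for the ONNX backend both type-check. A short usage sketch against the types defined above:

from lang_main.types import (
    ONNXExecutionProvider,
    STFRModelArgs,
    STFRQuantFilenames,
)

args_torch: STFRModelArgs = {}  # torch backend needs no extra kwargs
args_onnx: STFRModelArgs = {
    'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
    'provider': ONNXExecutionProvider.CPU,
    'export': False,
}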