major changes in several aspects: optional dependency groups, iterative config file discovery, configurable logging, and sentence-transformers backend selection

This commit is contained in:
Florian Förster 2024-11-07 17:30:33 +01:00
parent 27d40d5c99
commit a0ca71ea87
22 changed files with 1628 additions and 479 deletions

lang-main.code-workspace (new file, 8 lines)
View File

@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

View File

@ -1236,7 +1236,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,

View File

@ -11,6 +11,251 @@
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e1c76cc-6f99-484a-b73c-c99d9d567bbd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f34b343b-a7b6-4a6d-8db4-7f2f71addc54",
"metadata": {},
"outputs": [],
"source": [
"from importlib.util import find_spec"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "59c32d49-5bad-4f48-b91f-f069487ff543",
"metadata": {},
"outputs": [],
"source": [
"importlib.util.find_spec('nunpy')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "dd3d9434-374d-42a4-9c08-230495d0f5a6",
"metadata": {},
"outputs": [],
"source": [
"_has_dash: bool = True if (find_spec('dash') and find_spec('kalido')) else False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "4da22bea-d086-4097-86f6-37784b59e02b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_has_dash"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "dfc51145-759e-43ee-a850-580149679c5b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"_has_py4cyto"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbdbc7c1-4dd6-49d6-b570-3fc7eb60938f",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 18,
"id": "36aa5c9c-c29a-4a05-8854-30b086a9240e",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b8afbf49-e61c-49ce-af53-694b01f90702",
"metadata": {},
"outputs": [],
"source": [
"curr_path = Path.cwd()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "c353967a-2702-4370-b4f2-914ac3b950a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(curr_path.glob('./lang_main_consdfig.toml'))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "48ee9b11-60fa-43d4-87de-c9a2172a563b",
"metadata": {},
"outputs": [],
"source": [
"pkg_dir = curr_path"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "02d404e1-a352-44f6-a4e9-96b6aed0c3a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks')"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pkg_dir"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "fc9142c2-9e59-4113-811c-7a03068857f2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[WindowsPath('A:/Arbeitsaufgaben/lang-main/notebooks'),\n",
" WindowsPath('A:/Arbeitsaufgaben/lang-main'),\n",
" WindowsPath('A:/Arbeitsaufgaben'),\n",
" WindowsPath('A:/')]"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list((pkg_dir/'home.py').parents)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "ac1312bf-332f-4008-8951-c53e35f37990",
"metadata": {},
"outputs": [],
"source": [
"cfg_found = False\n",
"for it in range(len(pkg_dir.parents)):\n",
" search_path = pkg_dir.parents[it]\n",
" res = tuple(search_path.glob(f'lang_main*.toml'))\n",
" if res:\n",
" cfg_found = True\n",
" target = res[0]\n",
" break\n",
" if search_path.name == 'python':\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "a9bb96cc-f8a7-4749-ae2b-62e363d18f54",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cfg_found"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "58472aba-e887-4c50-857a-894c1c5c9003",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"WindowsPath('A:/Arbeitsaufgaben/lang-main/lang_main_config.toml')"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"target"
]
},
{
"cell_type": "code",
"execution_count": 2,
@ -11562,7 +11807,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,

pdm.lock (generated, 1315 lines changed)

File diff suppressed because it is too large

View File

@ -8,21 +8,31 @@ authors = [
dependencies = [
"pandas>=2.2.2",
"networkx>=3.3",
"spacy[lookups,transformers]>=3.7.4",
"sentence-transformers>=2.7.0",
"numpy<=1.26.4",
"spacy>=3.7.4",
"sentence-transformers[onnx]>=3.2.0",
"numpy>=1.26.4",
"pip>=24.0",
"typing-extensions>=4.12.2",
"plotly>=5.22.0",
"dash>=2.17.0",
"dash-cytoscape>=1.0.1",
"py4cytoscape>=1.9.0",
"kaleido==0.2.1",
"tqdm>=4.67.0",
"python-dateutil>=2.9.0.post0",
]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "LicenseRef-Proprietary"}
[project.optional-dependencies]
dash = [
"dash-cytoscape>=1.0.2",
"dash>=2.18.2",
"kaleido==0.2.1",
]
plot = [
"kaleido==0.2.1",
"plotly>=5.24.1",
]
cytoscape = [
"py4cytoscape>=1.11.0",
]
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
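
The visualisation stack (dash, plotly/kaleido, py4cytoscape) moves from hard requirements into optional dependency groups. Runtime code probes for these packages instead of assuming them; a minimal sketch of that probing pattern, mirroring what constants.py does further down in this commit (variable names taken from there, grouping illustrative):

from importlib.util import find_spec

# True only if the corresponding optional extra is actually installed
_has_py4cyto = find_spec('py4cytoscape') is not None
_has_dash = find_spec('dash') is not None and find_spec('kaleido') is not None
_has_plotly = find_spec('plotly') is not None and find_spec('kaleido') is not None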

View File

@ -1,9 +1,9 @@
import time
import webbrowser
from collections.abc import Collection, Iterable
from pathlib import Path
from threading import Thread
from typing import Any, Final, cast
from pathlib import Path
# import dash_cytoscape as cyto
import plotly.express as px

View File

@ -0,0 +1,58 @@
# lang_main: Config file
[paths]
inputs = './inputs/'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'
# debugging features only; production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
token_analysis_skip = true
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_number = 330
# threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8
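
This is the internal default config shipped with the package; lang_main.io.load_toml_config (not part of this diff) reads it. A minimal sketch of such a loader on Python 3.11, assuming it is a thin wrapper around the standard-library tomllib:

import tomllib
from pathlib import Path
from typing import Any

def load_toml_config(path_to_toml: Path) -> dict[str, Any]:
    # assumption: plain tomllib read, no validation
    with open(path_to_toml, 'rb') as file:
        return tomllib.load(file)

cfg = load_toml_config(Path('lang_main_config.toml'))
skip_preprocessing = cfg['control']['preprocessing_skip']  # -> True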

View File

@ -7,14 +7,19 @@ inputs = './inputs/'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control]
preprocessing_skip = true
preprocessing_skip = false
token_analysis_skip = true
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
graph_postprocessing_skip = true
graph_rescaling_skip = true
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]

View File

@ -1,56 +1,127 @@
import logging
import os
import shutil
import sys
from pathlib import Path
from time import gmtime
from typing import Any, Final
_has_py4cyto: bool = True
try:
import py4cytoscape as p4c
except ImportError:
_has_py4cyto = False
from lang_main.io import load_toml_config
# ** py4cytoscape config
if _has_py4cyto:
p4c.set_summary_logger(False)
p4c.py4cytoscape_logger.detail_logger.setLevel('ERROR')
p4c.py4cytoscape_logger.detail_logger.removeHandler(p4c.py4cytoscape_logger.detail_handler)
p4c.py4cytoscape_logger.detail_logger.removeHandler(
p4c.py4cytoscape_logger.detail_handler
)
p4c.py4cytoscape_logger.detail_logger.addHandler(logging.NullHandler())
# ** lang-main config
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
logging.basicConfig(
stream=sys.stdout,
format=LOG_FMT,
datefmt=LOG_DATE_FMT,
)
CONFIG_FILENAME: Final[str] = 'lang_main_config.toml'
CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml'
USE_INTERNAL_CONFIG: Final[bool] = False
PREFER_INTERNAL_CONFIG: Final[bool] = False
pkg_dir = Path(__file__).parent
cfg_path_internal = (pkg_dir / CONFIG_FILENAME).resolve()
cyto_stylesheet_path = (pkg_dir / CYTO_STYLESHEET_FILENAME).resolve()
# ** load config data: internal/external
if USE_INTERNAL_CONFIG:
loaded_cfg = load_toml_config(path_to_toml=cfg_path_internal)
else:
cfg_path_external = (Path.cwd() / CONFIG_FILENAME).resolve()
if not cfg_path_external.exists():
shutil.copy(cfg_path_internal, cfg_path_external)
sys.exit(
(
'No config file was found. A new one with default values was created '
'in the execution path. Please fill in the necessary values and '
'restart the program.'
)
)
# raise NotImplementedError("External config data not implemented yet.")
loaded_cfg = load_toml_config(path_to_toml=cfg_path_external)
CONFIG: Final[dict[str, Any]] = loaded_cfg.copy()
# ** load config data: internal/external
# look for external config first, if not found use internal one
def search_cwd(
glob_pattern: str = CONFIG_FILENAME,
) -> Path | None:
"""Searches the current working directory and looks for files
matching the glob pattern.
Returns the first match encountered.
Parameters
----------
glob_pattern : str, optional
pattern to look for, first match will be returned,
by default CONFIG_FILENAME
Returns
-------
Path | None
path of the first matching file if one was found, None otherwise
"""
cfg_path: Path | None = None
res = tuple(Path.cwd().glob(glob_pattern))
if res:
cfg_path = res[0]
return cfg_path
def search_iterative(
starting_path: Path,
glob_pattern: str = CONFIG_FILENAME,
stop_folder_name: str | None = None,
) -> Path | None:
"""Iteratively searches the parent directories of the starting path
and looks for files matching the glob pattern. The starting path itself is not
searched, only its parents; it may therefore also point to a file, in which
case the folder containing it is searched.
Returns the first match encountered.
Parameters
----------
starting_path : Path
non-inclusive starting path
glob_pattern : str, optional
pattern to look for, first match will be returned,
by default CONFIG_FILENAME
stop_folder_name : str | None, optional
name of the last folder in the directory tree to search, by default None
Returns
-------
Path | None
path of the first matching file if one was found, None otherwise
"""
cfg_path: Path | None = None
for it in range(len(starting_path.parents)):
search_path = starting_path.parents[it] # do not look in library folder
res = tuple(search_path.glob(glob_pattern))
if res:
cfg_path = res[0]
break
if stop_folder_name is not None and search_path.name == stop_folder_name:
# library is placed inside a whole python installation for deployment
# only look up to this folder
break
return cfg_path
def load_cfg() -> dict[str, Any]:
cfg_path: Path | None
if PREFER_INTERNAL_CONFIG:
cfg_path = cfg_path_internal
else:
cfg_path = search_cwd(glob_pattern=CONFIG_FILENAME)
if cfg_path is None:
cfg_path = search_iterative(
starting_path=pkg_dir,
glob_pattern=CONFIG_FILENAME,
stop_folder_name='python',
)
# backup: use internal config
if cfg_path is None:
cfg_path = cfg_path_internal
config = load_toml_config(path_to_toml=cfg_path)
return config.copy()
CONFIG: Final[dict[str, Any]] = load_cfg()
# ** Cytoscape configuration
# stylesheet
@ -64,7 +135,7 @@ CYTO_PATH_STYLESHEET: Final[Path] = cyto_stylesheet_path
# TODO check removal
# append Graphviz binary folder to system path if not already contained
if sys.platform == 'win32':
path = Path(r'C:\Program Files\Graphviz\bin')
if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
os.environ['PATH'] += f';{path}'
# if sys.platform == 'win32':
# path = Path(r'C:\Program Files\Graphviz\bin')
# if path.is_dir() and str(path).lower() not in os.environ['PATH'].lower():
# os.environ['PATH'] += f';{path}'
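
Instead of the old USE_INTERNAL_CONFIG switch, load_cfg now resolves the config file in three steps. A condensed usage sketch (names from the module above; the lookup order is the substance, the surrounding lines are illustrative):

from lang_main import CONFIG

# load_cfg() resolution order (unless PREFER_INTERNAL_CONFIG is set):
#  1. lang_main_config.toml in the current working directory (search_cwd)
#  2. the parent folders of the installed package, stopping at a folder named
#     'python' (search_iterative) -- covers the library being embedded in a
#     bundled Python installation
#  3. the internal default config shipped next to __init__.py
skip_preprocessing = CONFIG['control']['preprocessing_skip']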

View File

@ -386,7 +386,7 @@ def pipe_rescale_graph_edge_weights(
def normalise_array_linear(
array: npt.NDArray[np.float_],
array: npt.NDArray[np.float32],
) -> npt.NDArray[np.float32]:
"""apply standard linear normalisation
@ -445,7 +445,7 @@ def verify_property(
graph: Graph | DiGraph,
property: str,
) -> None:
for idx, (node_1, node_2) in enumerate(graph.edges):
for node_1, node_2 in graph.edges:
if property not in graph[node_1][node_2]:
raise EdgePropertyNotContainedError(
(

View File

@ -1,8 +1,10 @@
from __future__ import annotations
from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import cast
from typing import TYPE_CHECKING, cast
import numpy as np
import pandas as pd
@ -21,15 +23,10 @@ from lang_main.analysis.shared import (
similar_index_groups,
)
from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex
# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')
if TYPE_CHECKING:
from lang_main.pipelines.base import Pipeline
# ** (1) dataset preparation: loading and simple preprocessing
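
Pulling the Pipeline import under TYPE_CHECKING (together with from __future__ import annotations) keeps the name available to type checkers without importing lang_main.pipelines.base at runtime, the usual move to avoid an import cycle or defer a heavy import. The pattern in isolation, sketched with a hypothetical function:

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # evaluated by static type checkers only, never at runtime
    from lang_main.pipelines.base import Pipeline

def run_in(pipeline: Pipeline) -> None:  # hypothetical; the annotation stays a string at runtime
    ...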

View File

@ -5,8 +5,9 @@ from typing import cast
import networkx as nx
import numpy as np
import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
# import sentence_transformers # TODO check removal
# import sentence_transformers.util # TODO check removal
from networkx import Graph
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
@ -76,7 +77,6 @@ def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,
cos_sim_threshold: float = 0.5,
# ) -> Iterator[tuple[PandasIndex, PandasIndex]]:
) -> Iterator[tuple[PandasIndex, PandasIndex]]:
"""function to filter candidate indices based on cosine similarity
using SentenceTransformer model in batch mode,
@ -111,7 +111,9 @@ def candidates_by_index(
),
)
# cosine similarity
cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
# TODO check removal
# cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
np.fill_diagonal(cos_sim, 0.0)
cos_sim = np.triu(cos_sim)
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
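
model.similarity() is the instance-level replacement for sentence_transformers.util.cos_sim available since sentence-transformers 3.x (the pyproject now requires >=3.2.0); it applies the similarity_fn_name the model was created with, COSINE in constants.py. A self-contained sketch, assuming the all-mpnet-base-v2 checkpoint can be downloaded or is cached locally:

from sentence_transformers import SentenceTransformer, SimilarityFunction

model = SentenceTransformer(
    'all-mpnet-base-v2',
    similarity_fn_name=SimilarityFunction.COSINE,
)
embds = model.encode(['Pumpe defekt', 'Pumpe ausgefallen', 'Filter getauscht'])
scores = model.similarity(embds, embds)  # torch.Tensor of pairwise cosine scores
print(scores.numpy())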

View File

@ -11,6 +11,11 @@ from lang_main.analysis.graphs import (
TokenGraph,
update_graph,
)
from lang_main.constants import (
POS_INDIRECT,
POS_OF_INTEREST,
TAG_OF_INTEREST,
)
from lang_main.loggers import logger_token_analysis as logger
from lang_main.types import (
PandasIndex,
@ -19,24 +24,8 @@ from lang_main.types import (
SpacyToken,
)
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** obtaining connection in texts
def pre_clean_word(string: str) -> str:
pattern = r'[^A-Za-zäöüÄÖÜ]+'
string = re.sub(pattern, '', string)
@ -49,7 +38,6 @@ def is_str_date(
string: str,
fuzzy: bool = False,
) -> bool:
# print(string)
try:
# check if string is a number
# if length is greater than 8, it is not a date

View File

@ -1,6 +1,10 @@
from enum import Enum # noqa: I001
from importlib.util import find_spec
from pathlib import Path
from typing import Final
from sentence_transformers import SimilarityFunction
from lang_main import CONFIG, CYTO_PATH_STYLESHEET
from lang_main import model_loader as m_load
from lang_main.types import (
@ -8,7 +12,12 @@ from lang_main.types import (
CytoLayouts,
LanguageModels,
ModelLoaderMap,
ONNXExecutionProvider, # noqa: F401
STFRBackends,
STFRDeviceTypes,
STFRModelArgs,
STFRModels,
STFRQuantFilenames, # noqa: F401
)
__all__ = [
@ -16,8 +25,23 @@ __all__ = [
'CYTO_PATH_STYLESHEET',
]
# ** dependencies
_has_py4cyto: bool = True if find_spec('py4cytoscape') else False
_has_dash: bool = True if (find_spec('dash') and find_spec('kaleido')) else False
_has_plotly: bool = True if (find_spec('plotly') and find_spec('kaleido')) else False
class Dependencies(Enum):
PY4C = _has_py4cyto
DASH = _has_dash
PLOT = _has_plotly
# ** logging
# graphs
ENABLE_LOGGING: Final[bool] = CONFIG['logging']['enabled']
LOGGING_TO_FILE: Final[bool] = CONFIG['logging']['file']
LOGGING_TO_STDERR: Final[bool] = CONFIG['logging']['stderr']
LOGGING_DEFAULT_GRAPHS: Final[bool] = False
# ** paths
@ -44,14 +68,25 @@ SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
# ** models
# ** loading
SPACY_MODEL_NAME: Final[str] = 'de_dep_news_trf'
STFR_MODEL_NAME: Final[str] = 'sentence-transformers/all-mpnet-base-v2'
STFR_MODEL_NAME: Final[STFRModels] = STFRModels.ALL_MPNET_BASE_V2
STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU
STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE
STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH
STFR_MODEL_ARGS: Final[STFRModelArgs] = {}
# STFR_MODEL_ARGS: Final[STFRModelArgs] = {
# 'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
# 'provider': ONNXExecutionProvider.CPU,
# 'export': False,
# }
MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
LanguageModels.SENTENCE_TRANSFORMER: {
'func': m_load.load_sentence_transformer,
'kwargs': {
'model_name': STFR_MODEL_NAME,
'similarity_func': STFR_SIMILARITY,
'backend': STFR_BACKEND,
'device': STFR_DEVICE,
'model_kwargs': STFR_MODEL_ARGS,
},
},
LanguageModels.SPACY: {
@ -61,6 +96,19 @@ MODEL_LOADER_MAP: Final[ModelLoaderMap] = {
},
},
}
# ** language dependency analysis
# ** POS
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'ADJ', 'VERB', 'AUX'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN'])
# POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX'])
POS_OF_INTEREST: frozenset[str] = frozenset(['NOUN', 'PROPN', 'VERB', 'AUX', 'ADV'])
# POS_INDIRECT: frozenset[str] = frozenset(['AUX', 'VERB'])
POS_INDIRECT: frozenset[str] = frozenset(['AUX'])
# ** TAG
# TAG_OF_INTEREST: frozenset[str] = frozenset(['ADJD'])
TAG_OF_INTEREST: frozenset[str] = frozenset()
# ** export
# ** preprocessing
@ -74,7 +122,6 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity'
# ** graph postprocessing
EDGE_WEIGHT_DECIMALS: Final[int] = 4
THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number']
# THRESHOLD_EDGE_WEIGHT: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_weight']
PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted'
# ** graph exports (Cytoscape)

View File

@ -14,3 +14,25 @@ class EmptyEdgesError(EmptyGraphError):
class GraphRenderError(Exception):
"""Error raised if a graph object can not be rendered"""
class DependencyMissingError(Exception):
"""Error raised if needed dependency could not be found"""
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""
class WrongActionTypeError(Exception):
"""Error raised if added action type is not supported by corresponding pipeline"""
class OutputInPipelineContainerError(Exception):
"""Error raised if an output was detected by one of the performed
actions in a PipelineContainer. Each action in a PipelineContainer is itself a
procedure which does not have any parameters or return values and should therefore not
return any values."""

View File

@ -7,6 +7,11 @@ inputs = './inputs/'
results = './results/test_20240807/'
dataset = '../data/02_202307/Export4.csv'
[logging]
enabled = true
stderr = true
file = true
# only debugging features, production-ready pipelines should always
# be fully executed
[control]

View File

@ -1,16 +1,65 @@
import logging
import logging.handlers
from pathlib import Path
from time import gmtime
from typing import Final
from lang_main.constants import (
ENABLE_LOGGING,
LOGGING_TO_FILE,
LOGGING_TO_STDERR,
)
from lang_main.types import LoggingLevels
# ** logging
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.INFO
# ** config
logging.Formatter.converter = gmtime
LOG_FMT: Final[str] = '%(asctime)s | lang_main:%(module)s:%(levelname)s | %(message)s'
LOG_DATE_FMT: Final[str] = '%Y-%m-%d %H:%M:%S +0000'
LOG_FILE_PATH: Final[Path] = Path.cwd() / 'lang-main.log'
# logging.basicConfig(
# format=LOG_FMT,
# datefmt=LOG_DATE_FMT,
# )
# ** formatters
logger_all_formater = logging.Formatter(fmt=LOG_FMT, datefmt=LOG_DATE_FMT)
# ** handlers
null_handler = logging.NullHandler()
if ENABLE_LOGGING and LOGGING_TO_STDERR:
logger_all_handler_stderr = logging.StreamHandler()
logger_all_handler_stderr.setLevel(LoggingLevels.WARNING)
logger_all_handler_stderr.setFormatter(logger_all_formater)
else:
logger_all_handler_stderr = null_handler
if ENABLE_LOGGING and LOGGING_TO_FILE:
logger_all_handler_file = logging.handlers.RotatingFileHandler(
LOG_FILE_PATH,
encoding='utf-8',
maxBytes=5_242_880,
backupCount=1,
)
logger_all_handler_file.setLevel(LoggingLevels.DEBUG)
logger_all_handler_file.setFormatter(logger_all_formater)
else:
logger_all_handler_file = null_handler
# ** logging levels
LOGGING_LEVEL_ALL: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_PREPROCESS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_PIPELINES: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_GRAPHS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TIMELINE: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.INFO
LOGGING_LEVEL_TOKEN_ANALYSIS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_SHARED_HELPERS: Final[LoggingLevels] = LoggingLevels.DEBUG
LOGGING_LEVEL_RENDERING: Final[LoggingLevels] = LoggingLevels.DEBUG
# ** loggers and configuration
logger_all = logging.getLogger('lang_main')
logger_all.addHandler(logger_all_handler_stderr)
logger_all.addHandler(logger_all_handler_file)
logger_shared_helpers = logging.getLogger('lang_main.shared')
logger_shared_helpers.setLevel(LOGGING_LEVEL_SHARED_HELPERS)
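
All module loggers are children of the 'lang_main' logger, so the stderr handler (WARNING) and the rotating file handler (DEBUG) attached to logger_all cover the whole package through propagation. A minimal sketch of the effect in downstream code (logger name illustrative):

import logging

logger = logging.getLogger('lang_main.analysis.graphs')  # any child of 'lang_main'
logger.debug('written to lang-main.log only (file handler level: DEBUG), logger level permitting')
logger.warning('written to lang-main.log and stderr (stderr handler level: WARNING)')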

View File

@ -1,18 +1,28 @@
from __future__ import annotations
from typing import Literal, overload
from typing import (
TYPE_CHECKING,
Any,
Literal,
overload,
)
import spacy
from sentence_transformers import SentenceTransformer
from lang_main.constants import STFR_SIMILARITY
from lang_main.types import (
LanguageModels,
Model,
ModelLoaderMap,
SpacyModel,
STFRBackends,
STFRDeviceTypes,
)
if TYPE_CHECKING:
from sentence_transformers import SimilarityFunction
@overload
def instantiate_model(
@ -48,6 +58,15 @@ def load_spacy(
def load_sentence_transformer(
model_name: str,
device: STFRDeviceTypes,
similarity_func: SimilarityFunction = STFR_SIMILARITY,
backend: STFRBackends = STFRBackends.TORCH,
device: STFRDeviceTypes = STFRDeviceTypes.CPU,
model_kwargs: dict[str, Any] | None = None,
) -> SentenceTransformer:
return SentenceTransformer(model_name_or_path=model_name, device=device)
return SentenceTransformer(
model_name_or_path=model_name,
similarity_fn_name=similarity_func,
backend=backend, # type: ignore Literal matches Enum
device=device,
model_kwargs=model_kwargs,
)
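
The widened signature makes the ONNX backend reachable through the new model_kwargs parameter, matching the commented-out STFR_MODEL_ARGS block in constants.py. A hedged usage sketch (enum members from lang_main.types; whether export is needed depends on the model repository, and the onnx extra of sentence-transformers must be installed, which the pyproject already requests):

from lang_main import model_loader as m_load
from lang_main.types import (
    ONNXExecutionProvider,
    STFRBackends,
    STFRDeviceTypes,
    STFRQuantFilenames,
)

# load the quantised ONNX variant instead of the default torch backend
model = m_load.load_sentence_transformer(
    model_name='all-mpnet-base-v2',
    backend=STFRBackends.ONNX,
    device=STFRDeviceTypes.CPU,
    model_kwargs={
        'file_name': STFRQuantFilenames.ONNX_Q_UINT8,
        'provider': ONNXExecutionProvider.CPU,
        'export': False,
    },
)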

View File

@ -6,27 +6,15 @@ from pathlib import Path
from typing import Any, Never, cast
from typing_extensions import override
from lang_main.errors import (
NoPerformableActionError,
OutputInPipelineContainerError,
WrongActionTypeError,
)
from lang_main.io import load_pickle, save_pickle
from lang_main.loggers import logger_pipelines as logger
from lang_main.types import ResultHandling
# ** pipelines to perform given actions on dataset in a customisable manner
class NoPerformableActionError(Exception):
"""Error describing that no action is available in the current pipeline"""
class WrongActionTypeError(Exception):
"""Error raised if added action type is not supported by corresponding pipeline"""
class OutputInPipelineContainerError(Exception):
"""Error raised if an output was detected by one of the performed
actions in a PipelineContainer. Each action in a PipelineContainer is itself a
procedure which does not have any parameters or return values and should therefore not
return any values."""
class BasePipeline(ABC):
def __init__(

View File

@ -42,7 +42,6 @@ from lang_main.constants import (
UNIQUE_CRITERION_FEATURE,
)
from lang_main.pipelines.base import Pipeline
from lang_main.render import cytoscape as cyto
from lang_main.types import EntryPoints, LanguageModels
# ** Models
@ -137,13 +136,6 @@ def build_tk_graph_post_pipe() -> Pipeline:
pipe_graph_postprocessing = Pipeline(
name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
)
# pipe_graph_postprocessing.add(
# graphs.filter_graph_by_edge_weight,
# {
# 'bound_lower': THRESHOLD_EDGE_WEIGHT,
# 'bound_upper': None,
# },
# )
pipe_graph_postprocessing.add(
graphs.filter_graph_by_number_edges,
{
@ -190,6 +182,10 @@ def build_tk_graph_render_pipe(
export_folder: Path = SAVE_PATH_FOLDER,
base_network_name: str = CYTO_BASE_NETWORK_NAME,
) -> Pipeline:
# optional dependency: late import
# raises exception if necessary modules are not found
from lang_main.render import cytoscape as cyto
pipe_graph_rendering = Pipeline(
name='Graph_Static-Rendering',
working_dir=SAVE_PATH_FOLDER,

View File

@ -0,0 +1,7 @@
from lang_main.constants import Dependencies
from lang_main.errors import DependencyMissingError
if not Dependencies.PY4C.value:
raise DependencyMissingError(
'The module >>render<< needs the package >>Py4Cytoscape<<. Package not found.'
)
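
Because this check runs when the render package is imported, the late import inside build_tk_graph_render_pipe is what actually raises the error for callers without py4cytoscape. A sketch of how calling code might degrade gracefully (the try/except itself is illustrative, not part of this commit):

from lang_main.errors import DependencyMissingError

try:
    from lang_main.render import cytoscape as cyto
except DependencyMissingError:
    cyto = None  # rendering stays disabled when py4cytoscape is missing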

View File

@ -3,6 +3,7 @@ from collections.abc import Callable, Hashable
from typing import (
Any,
Literal,
NotRequired,
Required,
TypeAlias,
TypedDict,
@ -40,6 +41,27 @@ class LanguageModels(enum.StrEnum):
SPACY = enum.auto()
class ONNXExecutionProvider(enum.StrEnum):
CPU = 'CPUExecutionProvider'
class STFRModels(enum.StrEnum):
ALL_MPNET_BASE_V2 = 'all-mpnet-base-v2'
ALL_DISTILROBERTA_V1 = 'all-distilroberta-v1'
ALL_MINI_LM_L12_V2 = 'all-MiniLM-L12-v2'
ALL_MINI_LM_L6_V2 = 'all-MiniLM-L6-v2'
class STFRQuantFilenames(enum.StrEnum):
ONNX_Q_UINT8 = 'onnx/model_quint8_avx2.onnx'
class STFRModelArgs(TypedDict):
provider: NotRequired[ONNXExecutionProvider]
file_name: NotRequired[STFRQuantFilenames]
export: NotRequired[bool]
Model: TypeAlias = SentenceTransformer | SpacyModel
ModelLoaderFunc: TypeAlias = Callable[..., Model]
@ -52,7 +74,12 @@ class ModelLoaderInfo(TypedDict):
ModelLoaderMap: TypeAlias = dict[LanguageModels, ModelLoaderInfo]
# ** devices
class STFRBackends(enum.StrEnum):
TORCH = enum.auto()
ONNX = enum.auto()
OPENVINO = enum.auto()
class STFRDeviceTypes(enum.StrEnum):
CPU = enum.auto()
GPU = enum.auto()