initial commit V0.1.0

Florian Förster 2025-01-23 12:05:13 +01:00
commit 7786e2660c
26 changed files with 10127 additions and 0 deletions

162
.gitignore vendored Normal file

@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm-project.org/#use-with-ide
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

1
README.md Normal file

@ -0,0 +1 @@
# tom-plugin

5034
benchmark/model_tests.ipynb Normal file

File diff suppressed because it is too large

57
benchmark/stfr.py Normal file

@ -0,0 +1,57 @@
import os
from pathlib import Path
os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'
from lang_main.constants import SimilarityFunction
from lang_main.model_loader import load_sentence_transformer as load_stfr
from lang_main.types import (
ONNXExecutionProvider,
STFRBackends,
STFRDeviceTypes,
STFRModelArgs,
TorchDTypes,
)
MODEL_NAME = 'mixedbread-ai/deepset-mxbai-embed-de-large-v1'
MODEL_ARGS: STFRModelArgs = {
# 'torch_dtype': 'float32',
'export': False,
# 'file_name': 'onnx/model_uint8.onnx', # type: ignore
'file_name': 'onnx/model_quantized.onnx', # type: ignore
'provider': ONNXExecutionProvider.CPU,
}
MODEL_PATH = Path(r'A:\Arbeitsaufgaben\lang-models')
def load_models(model_name: str, trust_remote: bool = False, use_onnx: bool = False):
assert MODEL_PATH.exists(), 'model path not existing'
if use_onnx:
model_kwargs = MODEL_ARGS
backend = STFRBackends.ONNX
else:
model_kwargs = {'torch_dtype': 'float32'}
backend = STFRBackends.TORCH
stfr_model = load_stfr(
model_name=model_name, # type: ignore
similarity_func=SimilarityFunction.COSINE,
backend=backend,
local_files_only=False,
trust_remote_code=trust_remote,
model_save_folder=str(MODEL_PATH),
model_kwargs=model_kwargs,
)
return stfr_model
def main():
load_models(MODEL_NAME)
if __name__ == '__main__':
main()
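
A hedged usage sketch for the loader above (not part of this commit): use_onnx is the only switch between the PyTorch float32 weights and the quantised ONNX weights. The import assumes benchmark/ is the current working directory so the script is importable as stfr; the sample sentences are made up.

from stfr import MODEL_NAME, load_models  # assumes benchmark/ as working directory

torch_model = load_models(MODEL_NAME)                # PyTorch backend, float32
onnx_model = load_models(MODEL_NAME, use_onnx=True)  # quantised ONNX backend on CPU

sentences = ['Pumpe defekt', 'Motor ausgetauscht']   # made-up German sample texts
print(torch_model.encode(sentences).shape)
print(onnx_model.encode(sentences).shape)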

2
benchmark/test.toml Normal file

@ -0,0 +1,2 @@
[test]
t1 = nan

1
build.ps1 Normal file

@ -0,0 +1 @@
pdm build -d build/

2
bump_prerelease_num.ps1 Normal file

@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_n
pdm run bump-my-version show current_version

2
bump_release_type.ps1 Normal file

@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_l
pdm run bump-my-version show current_version

59
lang_main_config.toml Normal file

@ -0,0 +1,59 @@
# d-opt -- lang_main: config file
[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'
[models]
use_large_model = false
[logging]
enabled = true
stderr = true
file = true
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92
[graph_postprocessing]
max_edge_number = -1
[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8
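
The [control] section above is the switchboard for the pipeline stages: each *_skip flag disables one stage. A minimal, illustrative sketch of reading it with Python's standard tomllib (available since the project requires Python >= 3.11); the file name is this config, everything else is only for demonstration and not part of the commit.

import tomllib
from pathlib import Path

with Path('lang_main_config.toml').open('rb') as fh:  # tomllib only accepts binary files
    config = tomllib.load(fh)

for flag, skipped in config['control'].items():
    print(f'{flag}: {"skipped" if skipped else "executed"}')

print('preprocessing similarity threshold:', config['preprocess']['threshold_similarity'])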

56
lang_main_config_old.toml Normal file

@ -0,0 +1,56 @@
# d-opt -- lang_main: config file
[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'
[logging]
enabled = true
stderr = true
file = true
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92
[graph_postprocessing]
max_edge_number = -1
[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8


@ -0,0 +1,63 @@
# lang_main: Config file
[paths]
inputs = './lang-data/in'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './lang-data/out'
models = './lang-models/converted'
[logging]
enabled = true
stderr = true
file = true
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_number = 500
# threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

3966
pdm.lock generated Normal file

File diff suppressed because it is too large

1
publish.ps1 Normal file

@ -0,0 +1 @@
pdm publish -r local --skip-existing

139
pyproject.toml Normal file

@ -0,0 +1,139 @@
[project]
name = "tom-plugin"
version = "0.1.0"
description = "Wrapper for TOM plugins with various helper CLIs, primarily for integration testing"
authors = [
{name = "d-opt GmbH, resp. Florian Förster", email = "f.foerster@d-opt.com"},
]
dependencies = ["lang-main[spacy-md,spacy-trf]>=0.1.0"]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}
[project.scripts]
model-download = "tom_plugin._tools._load_model:main"
pipeline-test = "tom_plugin._tools._run:main"
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"
src = ["src"]
[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = ["E", "F", "I"]
[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]
[tool.pytest.ini_options]
addopts = [
"-vvl",
"--import-mode=importlib",
]
testpaths = [
"tests",
]
filterwarnings = [
'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
]
log_cli = true
[tool.coverage.run]
relative_files = true
source = [
"tom_plugin",
"tests/",
]
[tool.coverage.report]
exclude_also = [
"def __repr__",
"def __str__",
"@overload",
"if logging",
"if TYPE_CHECKING",
"@pytest.fixture",
"if __name__ == __main__:",
]
[tool.coverage.html]
directory = "reports/coverage"
[tool.pdm]
distribution = true
[tool.pdm.resolution]
respect-source-order = true
[[tool.pdm.source]]
name = "private"
url = "http://localhost:8001/simple"
verify_ssl = false
[[tool.pdm.source]]
name = "pypi"
url = "https://pypi.org/simple"
exclude_packages = ["lang-main*", "tom-plugin*"]
[dependency-groups]
dev = [
"bump-my-version>=0.29.0",
"jupyterlab>=4.3.4",
"ipywidgets>=8.1.5",
]
[tool.bumpversion]
current_version = "0.1.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
(?P<patch>0|[1-9]\\d*)
(?:
# separator for pre-release section
(?P<pre_l>[a-zA-Z-]+) # pre-release label
(?P<pre_n>0|[1-9]\\d*) # pre-release version number
)? # pre-release section is optional
"""
serialize = [
"{major}.{minor}.{patch}{pre_l}{pre_n}",
"{major}.{minor}.{patch}",
]
search = "{current_version}"
replace = "{new_version}"
regex = false
ignore_missing_version = false
ignore_missing_files = false
tag = false
sign_tags = false
tag_name = "v{new_version}"
tag_message = "Bump version: {current_version} → {new_version}"
allow_dirty = true
commit = false
message = "Bump version: {current_version} → {new_version}"
commit_args = ""
setup_hooks = []
pre_commit_hooks = []
post_commit_hooks = []
[tool.bumpversion.parts.pre_l]
values = ["dev", "a", "b", "rc", "final"]
optional_value = "final"
[[tool.bumpversion.files]]
filename = "pyproject.toml"
search = "version = \"{current_version}\""
replace = "version = \"{new_version}\""
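
The parse/serialize pair in [tool.bumpversion] above defines which version strings are accepted: a plain major.minor.patch release, optionally followed by a pre-release label from the pre_l values listed above and a number (so the bump_*.ps1 helpers step through versions such as 0.1.0dev0). An illustrative check of that grammar with Python's re module; the pattern is transcribed from the TOML, the sample versions are made up and not part of the commit.

import re

VERSION_RE = re.compile(
    r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)
    (?:
        (?P<pre_l>[a-zA-Z-]+)   # pre-release label, e.g. dev, a, b, rc
        (?P<pre_n>0|[1-9]\d*)   # pre-release number
    )?
    """
)

for candidate in ('0.1.0', '0.1.0dev0', '0.1.0rc1', '0.1'):
    match = VERSION_RE.fullmatch(candidate)
    print(candidate, '->', match.groupdict() if match else 'no match')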


@ -0,0 +1,3 @@
from tom_plugin import _env_vars as env
env.set()

46
src/tom_plugin/_env_vars.py Normal file

@ -0,0 +1,46 @@
import os
from typing import Final
# ** ENV VARS
def set() -> None:
library_mode = os.environ.get('DOPT_TOM_PLUGIN_LIBRARY_USAGE', None)
LIBRARY_MODE: Final[bool] = bool(library_mode)
if LIBRARY_MODE:
_set_lib_mode()
else:
_set_app_mode(
spacy_model=None,
STFR_model=None,
)
def _set_lib_mode() -> None:
os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'
def _set_app_mode(
spacy_model: str | None = None,
STFR_model: str | None = None,
) -> None:
os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'
os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'
if spacy_model is not None:
_set_spacy_model(spacy_model)
if STFR_model is not None:
_set_STFR_model(STFR_model)
def _set_spacy_model(
model_name: str = 'de_core_news_md',
) -> None:
os.environ['LANG_MAIN_SPACY_MODEL'] = model_name
def _set_STFR_model(
model_name: str = 'all-mpnet-base-v2',
) -> None:
# assumption: LANG_MAIN_STFR_MODEL is the SentenceTransformer counterpart to
# LANG_MAIN_SPACY_MODEL; the exact variable name is not confirmed by lang_main
os.environ['LANG_MAIN_STFR_MODEL'] = model_name
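
A hedged usage sketch for the module above (not part of this commit), assuming the three-line file earlier in the commit is the package __init__ that calls env.set() on import: DOPT_TOM_PLUGIN_LIBRARY_USAGE must therefore be exported before tom_plugin is imported, and any non-empty string counts as enabled.

import os

# Must be set before the first `import tom_plugin`, because env.set() runs at import time.
os.environ['DOPT_TOM_PLUGIN_LIBRARY_USAGE'] = '1'  # any non-empty value enables library mode

import tom_plugin  # noqa: E402,F401  (importing triggers _env_vars.set())

print(os.environ['LANG_MAIN_STFR_BACKEND'])  # 'onnx' in both library and app mode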


194
src/tom_plugin/_tools/_load_model.py Normal file

@ -0,0 +1,194 @@
import argparse
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, cast
from lang_main.constants import stfr_model_args_default
from lang_main.model_loader import (
MODEL_BASE_FOLDER,
STFR_BACKEND,
STFR_DEVICE,
STFR_MODEL_ARGS,
STFR_MODEL_NAME,
STFR_SIMILARITY,
load_sentence_transformer,
)
from lang_main.types import (
SentenceTransformer,
STFRBackends,
STFRModelArgs,
STFRONNXFilenames,
)
from sentence_transformers.backend import (
export_dynamic_quantized_onnx_model,
export_optimized_onnx_model,
)
@dataclass
class TypedArgumentParser:
default: bool
model: str
path: Path | None
convert: bool
optim: bool
quant: bool
def get_model_name_from_repo(
full_model_name: str,
) -> str:
return full_model_name.split('/')[-1]
def _preload_STFR_model(
model_name_repo: str,
backend: STFRBackends,
model_kwargs: STFRModelArgs | dict[str, Any] | None,
target_folder: Path | str | None,
) -> SentenceTransformer:
save_folder: str | None = None
if target_folder is not None:
save_folder = str(target_folder)
return load_sentence_transformer(
model_name=model_name_repo,
similarity_func=STFR_SIMILARITY,
backend=backend,
device=STFR_DEVICE,
model_kwargs=model_kwargs,
model_save_folder=save_folder,
local_files_only=False,
force_download=True,
)
def _load_config_STFR_model() -> None:
_ = _preload_STFR_model(
model_name_repo=STFR_MODEL_NAME,
backend=STFR_BACKEND,
model_kwargs=STFR_MODEL_ARGS,
target_folder=None,
)
def _model_conversion(
model_name_repo: str,
quant: bool,
optimise: bool,
target_folder: Path | None,
) -> None:
model_name = get_model_name_from_repo(model_name_repo)
base_folder: Path = MODEL_BASE_FOLDER
if target_folder is not None:
base_folder = target_folder
if base_folder.stem == 'converted':
export_folder = (base_folder / model_name).resolve()
else:
export_folder = (base_folder / 'converted' / model_name).resolve()
# attempt to download base model if not present
_ = _preload_STFR_model(
model_name_repo=model_name_repo,
backend=STFRBackends.TORCH,
model_kwargs=stfr_model_args_default,
target_folder=base_folder,
)
model_onnx = _preload_STFR_model(
model_name_repo=model_name_repo,
backend=STFRBackends.ONNX,
model_kwargs=None,
target_folder=base_folder,
)
model_onnx.save_pretrained(path=str(export_folder), safe_serialization=True)
path_export_onnx_base = export_folder / 'onnx' / 'model.onnx'
assert path_export_onnx_base.exists(), 'ONNX base weights not existing'
print(f'Saved converted ONNX model under: {path_export_onnx_base}')
if quant:
export_dynamic_quantized_onnx_model(
model_onnx, quantization_config='avx2', model_name_or_path=str(export_folder)
)
path_export_onnx_quant = export_folder / STFRONNXFilenames.ONNX_Q_UINT8
assert path_export_onnx_quant.exists(), 'ONNX quant weights not existing'
print(f'Saved quantised ONNX model under: {path_export_onnx_quant}')
os.remove(path_export_onnx_base)
if optimise:
export_optimized_onnx_model(
model_onnx, optimization_config='O3', model_name_or_path=str(export_folder)
)
path_export_onnx_optim = export_folder / STFRONNXFilenames.ONNX_OPT_O3
assert path_export_onnx_optim.exists(), 'ONNX optimised weights not existing'
print(f'Saved optimised ONNX model under: {path_export_onnx_optim}')
os.remove(path_export_onnx_base)
def main() -> None:
parser = argparse.ArgumentParser(
prog='STFR-Model-Loader',
description=(
'Helper program to pre-download SentenceTransformer models '
'and convert them to different formats if desired'
),
)
parser.add_argument(
'-d', '--default', action='store_true', help='load model from default config'
)
parser.add_argument(
'-m',
'--model',
default=STFR_MODEL_NAME,
help='model to load (full repo name from Hugging Face Hub)',
)
parser.add_argument(
'-p', '--path', type=Path, default=None, help='path to save models to'
)
parser.add_argument(
'-c', '--convert', action='store_true', help='convert model to ONNX format'
)
parser.add_argument(
'-o',
'--optim',
action='store_true',
help=(
'optimise ONNX model with O3 profile, model is '
'always converted to ONNX beforehand'
),
)
# parser.add_argument('--onnx', action='store_true', help='use ONNX backend')
parser.add_argument(
'--quant',
action='store_true',
help=(
'quantise model with "AVX2" configuration, model is always '
'converted to ONNX beforehand'
),
)
args = cast(TypedArgumentParser, parser.parse_args())
use_default_model = args.default
convert_model = args.convert
optimise_model = args.optim
quantise_model = args.quant
if use_default_model and convert_model:
raise ValueError('Loading default model does not allow model conversion')
path_models: Path | None = None
if args.path is not None:
path_models = args.path.resolve()
assert path_models.exists(), 'model saving path not existing'
assert path_models.is_dir(), 'model saving path not a directory'
if args.default:
_load_config_STFR_model()
else:
_model_conversion(
model_name_repo=args.model,
quant=quantise_model,
optimise=optimise_model,
target_folder=path_models,
)
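
For reference, a sketch of driving this tool through the model-download console script declared in pyproject.toml (available once the package is installed); this is not part of the commit. The flags mirror the argparse definition above, the repository name comes from the benchmark script, and the target directory is a placeholder.

import subprocess

# Download the model, convert it to ONNX and produce an AVX2-quantised variant.
# The --path directory must already exist (the tool asserts this).
subprocess.run(
    [
        'model-download',
        '--model', 'mixedbread-ai/deepset-mxbai-embed-de-large-v1',
        '--path', './lang-models',
        '--convert',
        '--quant',
    ],
    check=True,
)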

29
src/tom_plugin/_tools/_run.py Normal file

@ -0,0 +1,29 @@
import argparse
import time
from datetime import timedelta
from tom_plugin.pipeline import run_on_csv_data
def main() -> None:
parser = argparse.ArgumentParser(
prog='TOM-Plugin-Demo-Runner',
description='integration testing of provided pipelines in TOM-Plugin',
)
subparsers = parser.add_subparsers(dest='subparser')
parser_csv = subparsers.add_parser('csv', help='run on CSV data')
parser_csv.add_argument('id', help='ID for data set')
parser_csv.add_argument('filename', help='filename from configured input directory')
args = parser.parse_args()
if args.subparser == 'csv':
t1 = time.perf_counter()
run_on_csv_data(args.id, args.filename)
t2 = time.perf_counter()
run_time = t2 - t1
td = timedelta(seconds=run_time)
print(f'Application runtime was: {td}')
if __name__ == '__main__':
main()
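
Correspondingly, the pipeline-test entry point from pyproject.toml wraps this runner. A short invocation sketch (not part of the commit); the ID and data set name are the ones used in test.py later in this commit, and the file name is resolved against the configured input directory.

import subprocess

# Equivalent to running `pipeline-test csv 1234 Dummy_Dataset_N_1000` in a shell;
# the filename gets a .csv suffix and is looked up under [paths].inputs.
subprocess.run(['pipeline-test', 'csv', '1234', 'Dummy_Dataset_N_1000'], check=True)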


@ -0,0 +1,2 @@
# List of all of the library's environment variables
DOPT_TOM_PLUGIN_LIBRARY_USAGE : indicates that this wrapper application runs in library mode (used to select a different set of environment variables)

276
src/tom_plugin/pipeline.py Normal file

@ -0,0 +1,276 @@
from __future__ import annotations
import os
import typing
from pathlib import Path
from typing import cast
from lang_main.analysis.graphs import (
Graph,
TokenGraph,
save_to_GraphML,
)
from lang_main.constants import (
CYTO_BASE_NETWORK_NAME,
INPUT_PATH_FOLDER,
SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING,
SKIP_GRAPH_RESCALING,
SKIP_GRAPH_STATIC_RENDERING,
SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS,
)
from lang_main.errors import DependencyMissingError
from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import Pipeline
from lang_main.pipelines.predefined import (
build_base_target_feature_pipe,
build_merge_duplicates_pipe,
build_timeline_pipe,
build_tk_graph_pipe,
build_tk_graph_post_pipe,
build_tk_graph_render_pipe,
build_tk_graph_rescaling_pipe,
)
from lang_main.types import (
EntryPoints,
ObjectID,
PandasIndex,
SpacyDoc,
TimelineCandidates,
)
from pandas import DataFrame
# ** build pipelines
pipe_target_feat_on_csv = build_base_target_feature_pipe()
pipe_merge = build_merge_duplicates_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
pipe_graph_rescaling = build_tk_graph_rescaling_pipe(
save_result=True,
exit_point=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
pipe_timeline = build_timeline_pipe()
pipe_static_graph_rendering: Pipeline | None = None
# rendering depending on optional dependencies
try:
pipe_static_graph_rendering = build_tk_graph_render_pipe(
with_subgraphs=True,
base_network_name=CYTO_BASE_NETWORK_NAME,
)
except (ImportError, DependencyMissingError):
pass
all_pipes: tuple[Pipeline | None, ...] = (
pipe_target_feat_on_csv,
pipe_merge,
pipe_token_analysis,
pipe_graph_postprocessing,
pipe_graph_rescaling,
pipe_static_graph_rendering,
pipe_timeline,
)
# ENV variable: LANG_MAIN_SAVE_FOLDER : path for saving folder of current run
# ENV variable: LANG_MAIN_INPUT_DATA : path for input data of current run
def get_save_folder() -> Path:
save_folder_env = os.environ.get('LANG_MAIN_SAVE_FOLDER', None)
assert save_folder_env is not None, 'saving folder not defined as ENV variable'
save_folder = Path(save_folder_env)
assert save_folder.exists(), 'save folder does not exist'
return save_folder
def get_path_to_dataset() -> Path:
data_pth_env = os.environ.get('LANG_MAIN_INPUT_DATA', None)
assert data_pth_env is not None, 'path to dataset not defined as ENV variable'
data_pth = Path(data_pth_env)
assert data_pth.exists(), 'path to dataset does not exist'
return data_pth
def _set_save_folder(
target_folder: Path,
) -> None:
# save_folder = get_save_folder()
for pipe in all_pipes:
if pipe is not None:
pipe.working_dir = target_folder
# ** preparation
def _prepare_run_on_csv(
id: str,
filename: str,
) -> tuple[Path, Path]:
# output directory for intermediate results
print(f'Saving path: {SAVE_PATH_FOLDER}', flush=True)
target_folder = SAVE_PATH_FOLDER / id
create_saving_folder(
saving_path_folder=target_folder,
overwrite_existing=True,
)
assert target_folder.exists(), 'target folder not existing after creation'
# data set
data_pth = (INPUT_PATH_FOLDER / filename).with_suffix('.csv')
assert data_pth.exists(), 'path to data not existing'
assert data_pth.is_file(), 'data is not a file'
print(f'Data path: {data_pth}', flush=True)
return target_folder, data_pth
# ** preprocessing pipeline
def _run_preprocessing_on_csv(
target_folder: Path,
data_pth: Path,
) -> Path:
# data_pth = get_path_to_dataset()
# run pipelines
ret = typing.cast(
tuple[DataFrame], pipe_target_feat_on_csv.run(starting_values=(data_pth,))
)
target_feat_data = ret[0]
_ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))
return target_folder
# ** token analysis
def _run_token_analysis(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TOKEN_ANALYSIS)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
# build token graph
(tk_graph, _) = typing.cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
pipe_token_analysis.run(starting_values=(preprocessed_data,)),
)
tk_graph.to_GraphML(target_folder, filename='TokenGraph', directed=False)
return target_folder
def _run_graph_postprocessing(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_POST)
loaded_results = cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
# filter graph by edge weight and remove single nodes (no connection)
ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
tk_graph_filtered = ret[0]
tk_graph_filtered.to_GraphML(
target_folder, filename='TokenGraph-filtered', directed=False
)
return target_folder
def _run_graph_edge_rescaling(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_ANALYSIS)
loaded_results = cast(
tuple[TokenGraph],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
tk_graph_rescaled, tk_graph_rescaled_undirected = cast(
tuple[TokenGraph, Graph], pipe_graph_rescaling.run(starting_values=(tk_graph,))
)
tk_graph_rescaled.to_GraphML(
target_folder, filename='TokenGraph-directed-rescaled', directed=False
)
save_to_GraphML(
tk_graph_rescaled_undirected,
saving_path=target_folder,
filename='TokenGraph-undirected-rescaled',
)
return target_folder
def _run_static_graph_rendering(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(
target_folder,
EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
loaded_results = cast(
tuple[TokenGraph, Graph],
load_pickle(entry_point_path),
)
_ = loaded_results[0]
tk_graph_rescaled_undirected = loaded_results[1]
if pipe_static_graph_rendering is not None:
_ = pipe_static_graph_rendering.run(starting_values=(tk_graph_rescaled_undirected,))
return target_folder
# ** time analysis
def _run_time_analysis(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TIMELINE)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
_ = cast(
tuple[TimelineCandidates, dict[ObjectID, str]],
pipe_timeline.run(starting_values=(preprocessed_data,)),
)
return target_folder
def _build_pipeline_container(
target_folder: Path,
) -> Pipeline:
# save_folder = get_save_folder()
# container = PipelineContainer(name='Pipeline-Container-Base', working_dir=target_folder)
container = Pipeline(name='Pipeline-Base', working_dir=target_folder)
container.add(_run_preprocessing_on_csv, skip=SKIP_PREPROCESSING)
container.add(_run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
container.add(_run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
container.add(_run_graph_edge_rescaling, skip=SKIP_GRAPH_RESCALING)
container.add(_run_static_graph_rendering, skip=SKIP_GRAPH_STATIC_RENDERING)
container.add(_run_time_analysis, skip=SKIP_TIME_ANALYSIS)
return container
def run_on_csv_data(
id: str,
filename: str,
) -> None:
target_folder, data_pth = _prepare_run_on_csv(id=id, filename=filename)
_set_save_folder(target_folder)
procedure = _build_pipeline_container(target_folder)
procedure.run(starting_values=(target_folder, data_pth))
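
The comments above also document LANG_MAIN_SAVE_FOLDER and LANG_MAIN_INPUT_DATA, which feed the (currently unused) helpers get_save_folder() and get_path_to_dataset(). A small sketch of that contract, not part of the commit; the paths are placeholders and must exist, since both helpers assert on them.

import os

from tom_plugin import pipeline

os.environ['LANG_MAIN_SAVE_FOLDER'] = './lang-data/out'              # placeholder, must exist
os.environ['LANG_MAIN_INPUT_DATA'] = './lang-data/in/dataset.csv'    # placeholder, must exist

print(pipeline.get_save_folder())       # resolved saving folder for the current run
print(pipeline.get_path_to_dataset())   # resolved input data set for the current run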

21
test.py Normal file

@ -0,0 +1,21 @@
import os
os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'python'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'bin'
from tom_plugin import pipeline
def run_pipe() -> None:
# lang-data\in\Dummy_Dataset_N_1000.csv
# relative_path = r'.\lang-data\in\Dummy_Dataset_N_1000.csv'
# absolute_path = r'A:\Arbeitsaufgaben\lang-data\in\Dummy_Dataset_N_1000.csv'
filename: str = 'Dummy_Dataset_N_1000.csv'
# pipeline.run_on_csv_data(id='123', filename=relative_path)
# pipeline.run_on_csv_data(id='124', filename=absolute_path)
pipeline.run_on_csv_data(id='1234', filename=filename)
if __name__ == '__main__':
run_pipe()

0
tests/__init__.py Normal file


@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

2
update_and_publish.ps1 Normal file

@ -0,0 +1,2 @@
pdm update -u -x lang-main
pdm publish -r local --skip-existing

1
update_lang_main.ps1 Normal file

@ -0,0 +1 @@
pdm update -u -x lang-main