initial commit V0.1.0
commit 7786e2660c

162  .gitignore  vendored  Normal file
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm-project.org/#use-with-ide
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

5034  benchmark/model_tests.ipynb  Normal file
File diff suppressed because it is too large

57  benchmark/stfr.py  Normal file
@@ -0,0 +1,57 @@
import os
from pathlib import Path

os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'

from lang_main.constants import SimilarityFunction
from lang_main.model_loader import load_sentence_transformer as load_stfr
from lang_main.types import (
    ONNXExecutionProvider,
    STFRBackends,
    STFRDeviceTypes,
    STFRModelArgs,
    TorchDTypes,
)

MODEL_NAME = 'mixedbread-ai/deepset-mxbai-embed-de-large-v1'

MODEL_ARGS: STFRModelArgs = {
    # 'torch_dtype': 'float32',
    'export': False,
    # 'file_name': 'onnx/model_uint8.onnx',  # type: ignore
    'file_name': 'onnx/model_quantized.onnx',  # type: ignore
    'provider': ONNXExecutionProvider.CPU,
}

MODEL_PATH = Path(r'A:\Arbeitsaufgaben\lang-models')


def load_models(model_name: str, trust_remote: bool = False, use_onnx: bool = False):
    assert MODEL_PATH.exists(), 'model path not existing'
    if use_onnx:
        model_kwargs = MODEL_ARGS
        backend = STFRBackends.ONNX
    else:
        model_kwargs = {'torch_dtype': 'float32'}
        backend = STFRBackends.TORCH

    stfr_model = load_stfr(
        model_name=model_name,  # type: ignore
        similarity_func=SimilarityFunction.COSINE,
        backend=backend,
        local_files_only=False,
        trust_remote_code=trust_remote,
        model_save_folder=str(MODEL_PATH),
        model_kwargs=model_kwargs,
    )

    return stfr_model


def main():
    load_models(MODEL_NAME)


if __name__ == '__main__':
    main()
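
For illustration only (not part of the commit): a hypothetical continuation of this benchmark that embeds two short German maintenance texts with the loaded model and compares them. It assumes the installed sentence-transformers version provides encode() and similarity() on the returned model.

# hypothetical usage of the benchmark loader above
model = load_models(MODEL_NAME, use_onnx=True)
embeddings = model.encode(['Pumpe defekt', 'Pumpe ausgefallen'])
# 2x2 cosine-similarity matrix (SimilarityFunction.COSINE was configured in load_models)
print(model.similarity(embeddings, embeddings))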

2  benchmark/test.toml  Normal file
@@ -0,0 +1,2 @@
[test]
t1 = nan

2  bump_prerelease_num.ps1  Normal file
@@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_n
pdm run bump-my-version show current_version

2  bump_release_type.ps1  Normal file
@@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_l
pdm run bump-my-version show current_version

59  lang_main_config.toml  Normal file
@@ -0,0 +1,59 @@
# d-opt -- lang_main: config file

[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'

[models]
use_large_model = false

[logging]
enabled = true
stderr = true
file = true

# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true

[preprocess]
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92

[graph_postprocessing]
max_edge_number = -1

[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

56  lang_main_config_old.toml  Normal file
@@ -0,0 +1,56 @@
# d-opt -- lang_main: config file

[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'

[logging]
enabled = true
stderr = true
file = true

# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true

[preprocess]
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92

[graph_postprocessing]
max_edge_number = -1

[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

63  lang_main_config_old2.toml  Normal file
@@ -0,0 +1,63 @@
# lang_main: Config file

[paths]
inputs = './lang-data/in'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './lang-data/out'
models = './lang-models/converted'

[logging]
enabled = true
stderr = true
file = true

# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true

[preprocess]
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_number = 500
# threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
# input_features = [
#     'VorgangsTypName',
#     'VorgangsArtText',
#     'VorgangsBeschreibung',
# ]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

1  publish.ps1  Normal file
@@ -0,0 +1 @@
pdm publish -r local --skip-existing

139  pyproject.toml  Normal file
@@ -0,0 +1,139 @@
[project]
name = "tom-plugin"
version = "0.1.0"
description = "Wrapper for TOM plugins with different helper CLIs, primarily integration testing"
authors = [
    {name = "d-opt GmbH, resp. Florian Förster", email = "f.foerster@d-opt.com"},
]
dependencies = ["lang-main[spacy-md,spacy-trf]>=0.1.0"]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}

[project.scripts]
model-download = "tom_plugin._tools._load_model:main"
pipeline-test = "tom_plugin._tools._run:main"

[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"
src = ["src"]

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]

[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]

[tool.pytest.ini_options]
addopts = [
    "-vvl",
    "--import-mode=importlib",
]
testpaths = [
    "tests",
]
filterwarnings = [
    'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
]
log_cli = true

[tool.coverage.run]
relative_files = true
source = [
    "tom_plugin",
    "tests/",
]

[tool.coverage.report]
exclude_also = [
    "def __repr__",
    "def __str__",
    "@overload",
    "if logging",
    "if TYPE_CHECKING",
    "@pytest.fixture",
    "if __name__ == __main__:",
]

[tool.coverage.html]
directory = "reports/coverage"

[tool.pdm]
distribution = true

[tool.pdm.resolution]
respect-source-order = true

[[tool.pdm.source]]
name = "private"
url = "http://localhost:8001/simple"
verify_ssl = false

[[tool.pdm.source]]
name = "pypi"
url = "https://pypi.org/simple"
exclude_packages = ["lang-main*", "tom-plugin*"]

[dependency-groups]
dev = [
    "bump-my-version>=0.29.0",
    "jupyterlab>=4.3.4",
    "ipywidgets>=8.1.5",
]

[tool.bumpversion]
current_version = "0.1.0"
parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
    (?P<patch>0|[1-9]\\d*)
    (?:
        # separator for pre-release section
        (?P<pre_l>[a-zA-Z-]+)    # pre-release label
        (?P<pre_n>0|[1-9]\\d*)   # pre-release version number
    )?                           # pre-release section is optional
"""
serialize = [
    "{major}.{minor}.{patch}{pre_l}{pre_n}",
    "{major}.{minor}.{patch}",
]
search = "{current_version}"
replace = "{new_version}"
regex = false
ignore_missing_version = false
ignore_missing_files = false
tag = false
sign_tags = false
tag_name = "v{new_version}"
tag_message = "Bump version: {current_version} → {new_version}"
allow_dirty = true
commit = false
message = "Bump version: {current_version} → {new_version}"
commit_args = ""
setup_hooks = []
pre_commit_hooks = []
post_commit_hooks = []

[tool.bumpversion.parts.pre_l]
values = ["dev", "a", "b", "rc", "final"]
optional_value = "final"

[[tool.bumpversion.files]]
filename = "pyproject.toml"
search = "version = \"{current_version}\""
replace = "version = \"{new_version}\""
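
For illustration only (not part of the commit): the version parse pattern above can be exercised with Python's re module to see which components bump-my-version extracts from a version string. The pattern below is the same regex with TOML's doubled backslashes resolved.

import re

# same verbose regex as [tool.bumpversion].parse above
VERSION_PATTERN = re.compile(
    r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)
    (?:
        (?P<pre_l>[a-zA-Z-]+)   # pre-release label, e.g. dev/a/b/rc
        (?P<pre_n>0|[1-9]\d*)   # pre-release number
    )?
    """
)

print(VERSION_PATTERN.match('0.1.0').groupdict())
# {'major': '0', 'minor': '1', 'patch': '0', 'pre_l': None, 'pre_n': None}
print(VERSION_PATTERN.match('0.2.0rc1').groupdict())
# {'major': '0', 'minor': '2', 'patch': '0', 'pre_l': 'rc', 'pre_n': '1'}

Bumping pre_n (bump_prerelease_num.ps1) increases the pre-release number, while bumping pre_l (bump_release_type.ps1) advances the label through the values dev, a, b, rc, final configured above.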

3  src/tom_plugin/__init__.py  Normal file
@@ -0,0 +1,3 @@
from tom_plugin import _env_vars as env

env.set()

46  src/tom_plugin/_env_vars.py  Normal file
@@ -0,0 +1,46 @@
import os
from typing import Final


# ** ENV VARS
def set() -> None:
    library_mode = os.environ.get('DOPT_TOM_PLUGIN_LIBRARY_USAGE', None)
    LIBRARY_MODE: Final[bool] = bool(library_mode)

    if LIBRARY_MODE:
        _set_lib_mode()
    else:
        _set_app_mode(
            spacy_model=None,
            STFR_model=None,
        )


def _set_lib_mode() -> None:
    os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'


def _set_app_mode(
    spacy_model: str | None = None,
    STFR_model: str | None = None,
) -> None:
    os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'
    os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
    os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'

    if spacy_model is not None:
        _set_spacy_model(spacy_model)
    if STFR_model is not None:
        _set_STFR_model(STFR_model)


def _set_spacy_model(
    model_name: str = 'de_core_news_md',
) -> None:
    os.environ['LANG_MAIN_SPACY_MODEL'] = model_name


def _set_STFR_model(
    model_name: str = 'all-mpnet-base-v2',
) -> None:
    os.environ['LANG_MAIN_SPACY_MODEL'] = model_name

0  src/tom_plugin/_tools/__init__.py  Normal file

194  src/tom_plugin/_tools/_load_model.py  Normal file
@@ -0,0 +1,194 @@
import argparse
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, cast

from lang_main.constants import stfr_model_args_default
from lang_main.model_loader import (
    MODEL_BASE_FOLDER,
    STFR_BACKEND,
    STFR_DEVICE,
    STFR_MODEL_ARGS,
    STFR_MODEL_NAME,
    STFR_SIMILARITY,
    load_sentence_transformer,
)
from lang_main.types import (
    SentenceTransformer,
    STFRBackends,
    STFRModelArgs,
    STFRONNXFilenames,
)
from sentence_transformers.backend import (
    export_dynamic_quantized_onnx_model,
    export_optimized_onnx_model,
)


@dataclass
class TypedArgumentParser:
    default: bool
    model: str
    path: Path | None
    convert: bool
    optim: bool
    quant: bool


def get_model_name_from_repo(
    full_model_name: str,
) -> str:
    return full_model_name.split('/')[-1]


def _preload_STFR_model(
    model_name_repo: str,
    backend: STFRBackends,
    model_kwargs: STFRModelArgs | dict[str, Any] | None,
    target_folder: Path | str | None,
) -> SentenceTransformer:
    save_folder: str | None = None
    if target_folder is not None:
        save_folder = str(target_folder)

    return load_sentence_transformer(
        model_name=model_name_repo,
        similarity_func=STFR_SIMILARITY,
        backend=backend,
        device=STFR_DEVICE,
        model_kwargs=model_kwargs,
        model_save_folder=save_folder,
        local_files_only=False,
        force_download=True,
    )


def _load_config_STFR_model() -> None:
    _ = _preload_STFR_model(
        model_name_repo=STFR_MODEL_NAME,
        backend=STFR_BACKEND,
        model_kwargs=STFR_MODEL_ARGS,
        target_folder=None,
    )


def _model_conversion(
    model_name_repo: str,
    quant: bool,
    optimise: bool,
    target_folder: Path | None,
) -> None:
    model_name = get_model_name_from_repo(model_name_repo)
    base_folder: Path = MODEL_BASE_FOLDER
    if target_folder is not None:
        base_folder = target_folder

    if base_folder.stem == 'converted':
        export_folder = (base_folder / model_name).resolve()
    else:
        export_folder = (base_folder / 'converted' / model_name).resolve()

    # attempt to download base model if not present
    _ = _preload_STFR_model(
        model_name_repo=model_name_repo,
        backend=STFRBackends.TORCH,
        model_kwargs=stfr_model_args_default,
        target_folder=base_folder,
    )

    model_onnx = _preload_STFR_model(
        model_name_repo=model_name_repo,
        backend=STFRBackends.ONNX,
        model_kwargs=None,
        target_folder=base_folder,
    )
    model_onnx.save_pretrained(path=str(export_folder), safe_serialization=True)
    path_export_onnx_base = export_folder / 'onnx' / 'model.onnx'
    assert path_export_onnx_base.exists(), 'ONNX base weights not existing'
    print(f'Saved converted ONNX model under: {path_export_onnx_base}')

    if quant:
        export_dynamic_quantized_onnx_model(
            model_onnx, quantization_config='avx2', model_name_or_path=str(export_folder)
        )
        path_export_onnx_quant = export_folder / STFRONNXFilenames.ONNX_Q_UINT8
        assert path_export_onnx_quant.exists(), 'ONNX quant weights not existing'
        print(f'Saved quantised ONNX model under: {path_export_onnx_quant}')
        os.remove(path_export_onnx_base)
    if optimise:
        export_optimized_onnx_model(
            model_onnx, optimization_config='O3', model_name_or_path=str(export_folder)
        )
        path_export_onnx_optim = export_folder / STFRONNXFilenames.ONNX_OPT_O3
        assert path_export_onnx_optim.exists(), 'ONNX optimised weights not existing'
        print(f'Saved optimised ONNX model under: {path_export_onnx_optim}')
        os.remove(path_export_onnx_base)


def main() -> None:
    parser = argparse.ArgumentParser(
        prog='STFR-Model-Loader',
        description=(
            'Helper program to pre-download SentenceTransformer models '
            'and convert them to different formats if desired'
        ),
    )
    parser.add_argument(
        '-d', '--default', action='store_true', help='load model from default config'
    )
    parser.add_argument(
        '-m',
        '--model',
        default=STFR_MODEL_NAME,
        help='model to load (full repo name from Hugging Face Hub)',
    )
    parser.add_argument(
        '-p', '--path', type=Path, default=None, help='path to save models to'
    )
    parser.add_argument(
        '-c', '--convert', action='store_true', help='convert model to ONNX format'
    )
    parser.add_argument(
        '-o',
        '--optim',
        action='store_true',
        help=(
            'optimise ONNX model with O3 profile, model is '
            'always converted to ONNX beforehand'
        ),
    )
    # parser.add_argument('--onnx', action='store_true', help='use ONNX backend')
    parser.add_argument(
        '--quant',
        action='store_true',
        help=(
            'quantise model with "AVX2" configuration, model is always '
            'converted to ONNX beforehand'
        ),
    )

    args = cast(TypedArgumentParser, parser.parse_args())
    use_default_model = args.default
    convert_model = args.convert
    optimise_model = args.optim
    quantise_model = args.quant

    if use_default_model and convert_model:
        raise ValueError('Loading default model does not allow model conversion')

    path_models: Path | None = None
    if args.path is not None:
        path_models = args.path.resolve()
        assert path_models.exists(), 'model saving path not existing'
        assert path_models.is_dir(), 'model saving path not a directory'

    if args.default:
        _load_config_STFR_model()
    else:
        _model_conversion(
            model_name_repo=args.model,
            quant=quantise_model,
            optimise=optimise_model,
            target_folder=path_models,
        )
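
For illustration only (not part of the commit): a hypothetical programmatic equivalent of invoking the model-download entry point defined in pyproject.toml, using the repo name from benchmark/stfr.py; the target folder is an assumed local directory.

from pathlib import Path

from tom_plugin._tools._load_model import _model_conversion

# roughly what `model-download -m <repo> --quant -p <dir>` does internally
_model_conversion(
    model_name_repo='mixedbread-ai/deepset-mxbai-embed-de-large-v1',  # repo name used in benchmark/stfr.py
    quant=True,         # additionally export an AVX2-quantised ONNX file
    optimise=False,
    target_folder=Path('./lang-models'),  # hypothetical local model directory
)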

29  src/tom_plugin/_tools/_run.py  Normal file
@@ -0,0 +1,29 @@
import argparse
import time
from datetime import timedelta

from tom_plugin.pipeline import run_on_csv_data


def main() -> None:
    parser = argparse.ArgumentParser(
        prog='TOM-Plugin-Demo-Runner',
        description='integration testing of provided pipelines in TOM-Plugin',
    )
    subparsers = parser.add_subparsers(dest='subparser')
    parser_csv = subparsers.add_parser('csv', help='run on CSV data')
    parser_csv.add_argument('id', help='ID for data set')
    parser_csv.add_argument('filename', help='filename from configured input directory')

    args = parser.parse_args()
    if args.subparser == 'csv':
        t1 = time.perf_counter()
        run_on_csv_data(args.id, args.filename)
        t2 = time.perf_counter()
        run_time = t2 - t1
        td = timedelta(seconds=run_time)
        print(f'Application runtime was: {td}')


if __name__ == '__main__':
    main()

2  src/tom_plugin/env_vars.txt  Normal file
@@ -0,0 +1,2 @@
# list of all of this library's environment variables
DOPT_TOM_PLUGIN_LIBRARY_USAGE : indicates that this wrapper application is in library mode (used to set different environment variables)
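
For illustration only (not part of the commit): a minimal sketch of how library mode would be enabled, based on _env_vars.py above. The variable only has to hold any non-empty value before tom_plugin is imported, because __init__.py calls env.set() at import time.

import os

# any non-empty value is treated as truthy by _env_vars.set()
os.environ['DOPT_TOM_PLUGIN_LIBRARY_USAGE'] = '1'

import tom_plugin  # triggers env.set(), which now takes the library-mode branch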

276  src/tom_plugin/pipeline.py  Normal file
@@ -0,0 +1,276 @@
from __future__ import annotations

import os
import typing
from pathlib import Path
from typing import cast

from lang_main.analysis.graphs import (
    Graph,
    TokenGraph,
    save_to_GraphML,
)
from lang_main.constants import (
    CYTO_BASE_NETWORK_NAME,
    INPUT_PATH_FOLDER,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_GRAPH_RESCALING,
    SKIP_GRAPH_STATIC_RENDERING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
)
from lang_main.errors import DependencyMissingError
from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import Pipeline
from lang_main.pipelines.predefined import (
    build_base_target_feature_pipe,
    build_merge_duplicates_pipe,
    build_timeline_pipe,
    build_tk_graph_pipe,
    build_tk_graph_post_pipe,
    build_tk_graph_render_pipe,
    build_tk_graph_rescaling_pipe,
)
from lang_main.types import (
    EntryPoints,
    ObjectID,
    PandasIndex,
    SpacyDoc,
    TimelineCandidates,
)
from pandas import DataFrame

# ** build pipelines
pipe_target_feat_on_csv = build_base_target_feature_pipe()
pipe_merge = build_merge_duplicates_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
pipe_graph_rescaling = build_tk_graph_rescaling_pipe(
    save_result=True,
    exit_point=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
pipe_timeline = build_timeline_pipe()

pipe_static_graph_rendering: Pipeline | None = None
# rendering depending on optional dependencies
try:
    pipe_static_graph_rendering = build_tk_graph_render_pipe(
        with_subgraphs=True,
        base_network_name=CYTO_BASE_NETWORK_NAME,
    )
except (ImportError, DependencyMissingError):
    pass


all_pipes: tuple[Pipeline | None, ...] = (
    pipe_target_feat_on_csv,
    pipe_merge,
    pipe_token_analysis,
    pipe_graph_postprocessing,
    pipe_graph_rescaling,
    pipe_static_graph_rendering,
    pipe_timeline,
)


# ENV variable: LANG_MAIN_SAVE_FOLDER : path for saving folder of current run
# ENV variable: LANG_MAIN_INPUT_DATA : path for input data of current run
def get_save_folder() -> Path:
    save_folder_env = os.environ.get('LANG_MAIN_SAVE_FOLDER', None)
    assert save_folder_env is not None, 'saving folder not defined as ENV variable'
    save_folder = Path(save_folder_env)
    assert save_folder.exists(), 'save folder does not exist'

    return save_folder


def get_path_to_dataset() -> Path:
    data_pth_env = os.environ.get('LANG_MAIN_INPUT_DATA', None)
    assert data_pth_env is not None, 'path to dataset not defined as ENV variable'
    data_pth = Path(data_pth_env)
    assert data_pth.exists(), 'path to dataset does not exist'

    return data_pth


def _set_save_folder(
    target_folder: Path,
) -> None:
    # save_folder = get_save_folder()

    for pipe in all_pipes:
        if pipe is not None:
            pipe.working_dir = target_folder


# ** preparation
def _prepare_run_on_csv(
    id: str,
    filename: str,
) -> tuple[Path, Path]:
    # output directory for intermediate results
    print(f'Saving path: {SAVE_PATH_FOLDER}', flush=True)
    target_folder = SAVE_PATH_FOLDER / id
    create_saving_folder(
        saving_path_folder=target_folder,
        overwrite_existing=True,
    )
    assert target_folder.exists(), 'target folder not existing after creation'
    # data set
    data_pth = (INPUT_PATH_FOLDER / filename).with_suffix('.csv')

    assert data_pth.exists(), 'path to data not existing'
    assert data_pth.is_file(), 'data is not a file'
    print(f'Data path: {data_pth}', flush=True)

    return target_folder, data_pth


# ** preprocessing pipeline
def _run_preprocessing_on_csv(
    target_folder: Path,
    data_pth: Path,
) -> Path:
    # data_pth = get_path_to_dataset()
    # run pipelines
    ret = typing.cast(
        tuple[DataFrame], pipe_target_feat_on_csv.run(starting_values=(data_pth,))
    )
    target_feat_data = ret[0]
    _ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))

    return target_folder


# ** token analysis
def _run_token_analysis(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TOKEN_ANALYSIS)
    loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
    preprocessed_data = loaded_results[0]
    # build token graph
    (tk_graph, _) = typing.cast(
        tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
        pipe_token_analysis.run(starting_values=(preprocessed_data,)),
    )
    tk_graph.to_GraphML(target_folder, filename='TokenGraph', directed=False)

    return target_folder


def _run_graph_postprocessing(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_POST)
    loaded_results = cast(
        tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
        load_pickle(entry_point_path),
    )
    tk_graph = loaded_results[0]
    # filter graph by edge weight and remove single nodes (no connection)
    ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
    tk_graph_filtered = ret[0]
    tk_graph_filtered.to_GraphML(
        target_folder, filename='TokenGraph-filtered', directed=False
    )

    return target_folder


def _run_graph_edge_rescaling(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_ANALYSIS)
    loaded_results = cast(
        tuple[TokenGraph],
        load_pickle(entry_point_path),
    )
    tk_graph = loaded_results[0]
    tk_graph_rescaled, tk_graph_rescaled_undirected = cast(
        tuple[TokenGraph, Graph], pipe_graph_rescaling.run(starting_values=(tk_graph,))
    )
    tk_graph_rescaled.to_GraphML(
        target_folder, filename='TokenGraph-directed-rescaled', directed=False
    )
    save_to_GraphML(
        tk_graph_rescaled_undirected,
        saving_path=target_folder,
        filename='TokenGraph-undirected-rescaled',
    )

    return target_folder


def _run_static_graph_rendering(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(
        target_folder,
        EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
    )
    loaded_results = cast(
        tuple[TokenGraph, Graph],
        load_pickle(entry_point_path),
    )
    _ = loaded_results[0]
    tk_graph_rescaled_undirected = loaded_results[1]

    if pipe_static_graph_rendering is not None:
        _ = pipe_static_graph_rendering.run(starting_values=(tk_graph_rescaled_undirected,))

    return target_folder


# ** time analysis
def _run_time_analysis(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TIMELINE)
    loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
    preprocessed_data = loaded_results[0]

    _ = cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )

    return target_folder


def _build_pipeline_container(
    target_folder: Path,
) -> Pipeline:
    # save_folder = get_save_folder()
    # container = PipelineContainer(name='Pipeline-Container-Base', working_dir=target_folder)
    container = Pipeline(name='Pipeline-Base', working_dir=target_folder)
    container.add(_run_preprocessing_on_csv, skip=SKIP_PREPROCESSING)
    container.add(_run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
    container.add(_run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
    container.add(_run_graph_edge_rescaling, skip=SKIP_GRAPH_RESCALING)
    container.add(_run_static_graph_rendering, skip=SKIP_GRAPH_STATIC_RENDERING)
    container.add(_run_time_analysis, skip=SKIP_TIME_ANALYSIS)

    return container


def run_on_csv_data(
    id: str,
    filename: str,
) -> None:
    target_folder, data_pth = _prepare_run_on_csv(id=id, filename=filename)
    _set_save_folder(target_folder)
    procedure = _build_pipeline_container(target_folder)
    procedure.run(starting_values=(target_folder, data_pth))

21  test.py  Normal file
@@ -0,0 +1,21 @@
import os

os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'python'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'bin'

from tom_plugin import pipeline


def run_pipe() -> None:
    # lang-data\in\Dummy_Dataset_N_1000.csv
    # relative_path = r'.\lang-data\in\Dummy_Dataset_N_1000.csv'
    # absolute_path = r'A:\Arbeitsaufgaben\lang-data\in\Dummy_Dataset_N_1000.csv'
    filename: str = 'Dummy_Dataset_N_1000.csv'

    # pipeline.run_on_csv_data(id='123', filename=relative_path)
    # pipeline.run_on_csv_data(id='124', filename=absolute_path)
    pipeline.run_on_csv_data(id='1234', filename=filename)


if __name__ == '__main__':
    run_pipe()

0  tests/__init__.py  Normal file

8  tom-plugin.code-workspace  Normal file
@@ -0,0 +1,8 @@
{
    "folders": [
        {
            "path": "."
        }
    ],
    "settings": {}
}

2  update_and_publish.ps1  Normal file
@@ -0,0 +1,2 @@
pdm update -u -x lang-main
pdm publish -r local --skip-existing

1  update_lang_main.ps1  Normal file
@@ -0,0 +1 @@
pdm update -u -x lang-main