initial commit V0.1.0
commit 7786e2660c

.gitignore (vendored, new file, 162 lines)
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm-project.org/#use-with-ide
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

benchmark/model_tests.ipynb (new file, 5034 lines)
File diff suppressed because it is too large.

benchmark/stfr.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import os
from pathlib import Path

os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'

from lang_main.constants import SimilarityFunction
from lang_main.model_loader import load_sentence_transformer as load_stfr
from lang_main.types import (
    ONNXExecutionProvider,
    STFRBackends,
    STFRDeviceTypes,
    STFRModelArgs,
    TorchDTypes,
)

MODEL_NAME = 'mixedbread-ai/deepset-mxbai-embed-de-large-v1'

MODEL_ARGS: STFRModelArgs = {
    # 'torch_dtype': 'float32',
    'export': False,
    # 'file_name': 'onnx/model_uint8.onnx',  # type: ignore
    'file_name': 'onnx/model_quantized.onnx',  # type: ignore
    'provider': ONNXExecutionProvider.CPU,
}

MODEL_PATH = Path(r'A:\Arbeitsaufgaben\lang-models')


def load_models(model_name: str, trust_remote: bool = False, use_onnx: bool = False):
    assert MODEL_PATH.exists(), 'model path not existing'
    if use_onnx:
        model_kwargs = MODEL_ARGS
        backend = STFRBackends.ONNX
    else:
        model_kwargs = {'torch_dtype': 'float32'}
        backend = STFRBackends.TORCH

    stfr_model = load_stfr(
        model_name=model_name,  # type: ignore
        similarity_func=SimilarityFunction.COSINE,
        backend=backend,
        local_files_only=False,
        trust_remote_code=trust_remote,
        model_save_folder=str(MODEL_PATH),
        model_kwargs=model_kwargs,
    )

    return stfr_model


def main():
    load_models(MODEL_NAME)


if __name__ == '__main__':
    main()
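For context, a minimal sketch (not part of this commit) of how the benchmark could exercise the loaded model, assuming load_stfr returns a standard sentence_transformers.SentenceTransformer so that encode() and similarity() are available; the sample sentences and the package-style import are illustrative only.

import benchmark.stfr as stfr  # assumption: the repository root is on sys.path

# Load the quantised ONNX variant and compare two made-up maintenance texts.
model = stfr.load_models(stfr.MODEL_NAME, use_onnx=True)
embeddings = model.encode(['Pumpe defekt', 'Pumpe ausgefallen'])
print(model.similarity(embeddings[0:1], embeddings[1:2]))  # cosine, per SimilarityFunction.COSINE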

benchmark/test.toml (new file, 2 lines)
@@ -0,0 +1,2 @@
[test]
t1 = nan

bump_prerelease_num.ps1 (new file, 2 lines)
@@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_n
pdm run bump-my-version show current_version

bump_release_type.ps1 (new file, 2 lines)
@@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_l
pdm run bump-my-version show current_version

lang_main_config.toml (new file, 59 lines)
@@ -0,0 +1,59 @@
# d-opt -- lang_main: config file

[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'

[models]
use_large_model = false

[logging]
enabled = true
stderr = true
file = true

# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true

[preprocess]
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92

[graph_postprocessing]
max_edge_number = -1

[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8
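For orientation, a minimal sketch (not part of this commit) of reading this file with the Python 3.11+ standard library (tomllib); the helper below is hypothetical and not part of lang_main, it only mirrors the key names shown above.

import tomllib
from pathlib import Path


def read_control_flags(config_file: str = 'lang_main_config.toml') -> dict[str, bool]:
    # Parse the TOML config and return the per-pipeline skip switches from [control].
    with Path(config_file).open('rb') as file:
        config = tomllib.load(file)
    return {key: value for key, value in config['control'].items() if key.endswith('_skip')}


if __name__ == '__main__':
    print(read_control_flags())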

lang_main_config_old.toml (new file, 56 lines)
@@ -0,0 +1,56 @@
# d-opt -- lang_main: config file

[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'

[logging]
enabled = true
stderr = true
file = true

# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true

[preprocess]
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92

[graph_postprocessing]
max_edge_number = -1

[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

lang_main_config_old2.toml (new file, 63 lines)
@@ -0,0 +1,63 @@
# lang_main: Config file

[paths]
inputs = './lang-data/in'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './lang-data/out'
models = './lang-models/converted'

[logging]
enabled = true
stderr = true
file = true

# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true

[preprocess]
date_cols = [
    "VorgangsDatum",
    "ErledigungsDatum",
    "Arbeitsbeginn",
    "ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_number = 500
# threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'

[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

[time_analysis.model_input]
# input_features = [
#     'VorgangsTypName',
#     'VorgangsArtText',
#     'VorgangsBeschreibung',
# ]
input_features = [
    'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
    'Reparaturauftrag (Portal)',
    'Störungsmeldung',
]
threshold_num_acitivities = 1
threshold_similarity = 0.8

publish.ps1 (new file, 1 line)
@@ -0,0 +1 @@
pdm publish -r local --skip-existing

pyproject.toml (new file, 139 lines)
@@ -0,0 +1,139 @@
[project]
name = "tom-plugin"
version = "0.1.0"
description = "Wrapper for TOM plugins with different helper CLIs, primarily integration testing"
authors = [
    {name = "d-opt GmbH, resp. Florian Förster", email = "f.foerster@d-opt.com"},
]
dependencies = ["lang-main[spacy-md,spacy-trf]>=0.1.0"]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}

[project.scripts]
model-download = "tom_plugin._tools._load_model:main"
pipeline-test = "tom_plugin._tools._run:main"


[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"
src = ["src"]

[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false

[tool.ruff.lint]
select = ["E", "F", "I"]

[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]

[tool.pytest.ini_options]
addopts = [
    "-vvl",
    "--import-mode=importlib",
]
testpaths = [
    "tests",
]
filterwarnings = [
    'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
]
log_cli = true

[tool.coverage.run]
relative_files = true
source = [
    "tom_plugin",
    "tests/",
]

[tool.coverage.report]
exclude_also = [
    "def __repr__",
    "def __str__",
    "@overload",
    "if logging",
    "if TYPE_CHECKING",
    "@pytest.fixture",
    "if __name__ == __main__:",
]

[tool.coverage.html]
directory = "reports/coverage"


[tool.pdm]
distribution = true

[tool.pdm.resolution]
respect-source-order = true

[[tool.pdm.source]]
name = "private"
url = "http://localhost:8001/simple"
verify_ssl = false

[[tool.pdm.source]]
name = "pypi"
url = "https://pypi.org/simple"
exclude_packages = ["lang-main*", "tom-plugin*"]

[dependency-groups]
dev = [
    "bump-my-version>=0.29.0",
    "jupyterlab>=4.3.4",
    "ipywidgets>=8.1.5",
]

[tool.bumpversion]
current_version = "0.1.0"
parse = """(?x)
    (?P<major>0|[1-9]\\d*)\\.
    (?P<minor>0|[1-9]\\d*)\\.
    (?P<patch>0|[1-9]\\d*)
    (?:
        # separator for pre-release section
        (?P<pre_l>[a-zA-Z-]+)    # pre-release label
        (?P<pre_n>0|[1-9]\\d*)   # pre-release version number
    )?                           # pre-release section is optional
"""
serialize = [
    "{major}.{minor}.{patch}{pre_l}{pre_n}",
    "{major}.{minor}.{patch}",
]
search = "{current_version}"
replace = "{new_version}"
regex = false
ignore_missing_version = false
ignore_missing_files = false
tag = false
sign_tags = false
tag_name = "v{new_version}"
tag_message = "Bump version: {current_version} → {new_version}"
allow_dirty = true
commit = false
message = "Bump version: {current_version} → {new_version}"
commit_args = ""
setup_hooks = []
pre_commit_hooks = []
post_commit_hooks = []

[tool.bumpversion.parts.pre_l]
values = ["dev", "a", "b", "rc", "final"]
optional_value = "final"

[[tool.bumpversion.files]]
filename = "pyproject.toml"
search = "version = \"{current_version}\""
replace = "version = \"{new_version}\""
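As a cross-check of the [tool.bumpversion] parse pattern, a small sketch (not part of this commit) that applies the equivalent regular expression to a few made-up version strings; the doubled backslashes from TOML become single ones in Python.

import re

VERSION_PATTERN = re.compile(
    r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)
    (?:
        (?P<pre_l>[a-zA-Z-]+)   # pre-release label
        (?P<pre_n>0|[1-9]\d*)   # pre-release version number
    )?                          # pre-release section is optional
    """
)

for version in ('0.1.0', '0.2.0rc1', '1.0.0dev3'):  # illustrative values only
    match = VERSION_PATTERN.fullmatch(version)
    print(version, match.groupdict() if match else 'no match')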

src/tom_plugin/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
from tom_plugin import _env_vars as env

env.set()

src/tom_plugin/_env_vars.py (new file, 46 lines)
@@ -0,0 +1,46 @@
import os
from typing import Final


# ** ENV VARS
def set() -> None:
    library_mode = os.environ.get('DOPT_TOM_PLUGIN_LIBRARY_USAGE', None)
    LIBRARY_MODE: Final[bool] = bool(library_mode)

    if LIBRARY_MODE:
        _set_lib_mode()
    else:
        _set_app_mode(
            spacy_model=None,
            STFR_model=None,
        )


def _set_lib_mode() -> None:
    os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'


def _set_app_mode(
    spacy_model: str | None = None,
    STFR_model: str | None = None,
) -> None:
    os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'
    os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
    os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'

    if spacy_model is not None:
        _set_spacy_model(spacy_model)
    if STFR_model is not None:
        _set_STFR_model(STFR_model)


def _set_spacy_model(
    model_name: str = 'de_core_news_md',
) -> None:
    os.environ['LANG_MAIN_SPACY_MODEL'] = model_name


def _set_STFR_model(
    model_name: str = 'all-mpnet-base-v2',
) -> None:
    os.environ['LANG_MAIN_SPACY_MODEL'] = model_name

src/tom_plugin/_tools/__init__.py (new file, empty)

src/tom_plugin/_tools/_load_model.py (new file, 194 lines)
@@ -0,0 +1,194 @@
import argparse
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, cast

from lang_main.constants import stfr_model_args_default
from lang_main.model_loader import (
    MODEL_BASE_FOLDER,
    STFR_BACKEND,
    STFR_DEVICE,
    STFR_MODEL_ARGS,
    STFR_MODEL_NAME,
    STFR_SIMILARITY,
    load_sentence_transformer,
)
from lang_main.types import (
    SentenceTransformer,
    STFRBackends,
    STFRModelArgs,
    STFRONNXFilenames,
)
from sentence_transformers.backend import (
    export_dynamic_quantized_onnx_model,
    export_optimized_onnx_model,
)


@dataclass
class TypedArgumentParser:
    default: bool
    model: str
    path: Path | None
    convert: bool
    optim: bool
    quant: bool


def get_model_name_from_repo(
    full_model_name: str,
) -> str:
    return full_model_name.split('/')[-1]


def _preload_STFR_model(
    model_name_repo: str,
    backend: STFRBackends,
    model_kwargs: STFRModelArgs | dict[str, Any] | None,
    target_folder: Path | str | None,
) -> SentenceTransformer:
    save_folder: str | None = None
    if target_folder is not None:
        save_folder = str(target_folder)

    return load_sentence_transformer(
        model_name=model_name_repo,
        similarity_func=STFR_SIMILARITY,
        backend=backend,
        device=STFR_DEVICE,
        model_kwargs=model_kwargs,
        model_save_folder=save_folder,
        local_files_only=False,
        force_download=True,
    )


def _load_config_STFR_model() -> None:
    _ = _preload_STFR_model(
        model_name_repo=STFR_MODEL_NAME,
        backend=STFR_BACKEND,
        model_kwargs=STFR_MODEL_ARGS,
        target_folder=None,
    )


def _model_conversion(
    model_name_repo: str,
    quant: bool,
    optimise: bool,
    target_folder: Path | None,
) -> None:
    model_name = get_model_name_from_repo(model_name_repo)
    base_folder: Path = MODEL_BASE_FOLDER
    if target_folder is not None:
        base_folder = target_folder

    if base_folder.stem == 'converted':
        export_folder = (base_folder / model_name).resolve()
    else:
        export_folder = (base_folder / 'converted' / model_name).resolve()

    # attempt to download base model if not present
    _ = _preload_STFR_model(
        model_name_repo=model_name_repo,
        backend=STFRBackends.TORCH,
        model_kwargs=stfr_model_args_default,
        target_folder=base_folder,
    )

    model_onnx = _preload_STFR_model(
        model_name_repo=model_name_repo,
        backend=STFRBackends.ONNX,
        model_kwargs=None,
        target_folder=base_folder,
    )
    model_onnx.save_pretrained(path=str(export_folder), safe_serialization=True)
    path_export_onnx_base = export_folder / 'onnx' / 'model.onnx'
    assert path_export_onnx_base.exists(), 'ONNX base weights not existing'
    print(f'Saved converted ONNX model under: {path_export_onnx_base}')

    if quant:
        export_dynamic_quantized_onnx_model(
            model_onnx, quantization_config='avx2', model_name_or_path=str(export_folder)
        )
        path_export_onnx_quant = export_folder / STFRONNXFilenames.ONNX_Q_UINT8
        assert path_export_onnx_quant.exists(), 'ONNX quant weights not existing'
        print(f'Saved quantised ONNX model under: {path_export_onnx_quant}')
        os.remove(path_export_onnx_base)
    if optimise:
        export_optimized_onnx_model(
            model_onnx, optimization_config='O3', model_name_or_path=str(export_folder)
        )
        path_export_onnx_optim = export_folder / STFRONNXFilenames.ONNX_OPT_O3
        assert path_export_onnx_optim.exists(), 'ONNX optimised weights not existing'
        print(f'Saved optimised ONNX model under: {path_export_onnx_optim}')
        if path_export_onnx_base.exists():
            # base weights may already have been removed by the quantisation step
            os.remove(path_export_onnx_base)


def main() -> None:
    parser = argparse.ArgumentParser(
        prog='STFR-Model-Loader',
        description=(
            'Helper program to pre-download SentenceTransformer models '
            'and convert them to different formats if desired'
        ),
    )
    parser.add_argument(
        '-d', '--default', action='store_true', help='load model from default config'
    )
    parser.add_argument(
        '-m',
        '--model',
        default=STFR_MODEL_NAME,
        help='model to load (full repo name from Hugging Face Hub)',
    )
    parser.add_argument(
        '-p', '--path', type=Path, default=None, help='path to save models to'
    )
    parser.add_argument(
        '-c', '--convert', action='store_true', help='convert model to ONNX format'
    )
    parser.add_argument(
        '-o',
        '--optim',
        action='store_true',
        help=(
            'optimise ONNX model with O3 profile, model is '
            'always converted to ONNX beforehand'
        ),
    )
    # parser.add_argument('--onnx', action='store_true', help='use ONNX backend')
    parser.add_argument(
        '--quant',
        action='store_true',
        help=(
            'quantise model with "AVX2" configuration, model is always '
            'converted to ONNX beforehand'
        ),
    )

    args = cast(TypedArgumentParser, parser.parse_args())
    use_default_model = args.default
    convert_model = args.convert
    optimise_model = args.optim
    quantise_model = args.quant

    if use_default_model and convert_model:
        raise ValueError('Loading default model does not allow model conversion')

    path_models: Path | None = None
    if args.path is not None:
        path_models = args.path.resolve()
        assert path_models.exists(), 'model saving path not existing'
        assert path_models.is_dir(), 'model saving path not a directory'

    if args.default:
        _load_config_STFR_model()
    else:
        _model_conversion(
            model_name_repo=args.model,
            quant=quantise_model,
            optimise=optimise_model,
            target_folder=path_models,
        )
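The model-download entry point in pyproject.toml maps onto this main(); a hedged sketch (not part of this commit) of driving it programmatically, where the model name and flag combination are only examples.

import sys

from tom_plugin._tools import _load_model

# Emulate: model-download -m sentence-transformers/all-mpnet-base-v2 --convert --quant
sys.argv = ['model-download', '-m', 'sentence-transformers/all-mpnet-base-v2', '--convert', '--quant']
_load_model.main()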

src/tom_plugin/_tools/_run.py (new file, 29 lines)
@@ -0,0 +1,29 @@
import argparse
import time
from datetime import timedelta

from tom_plugin.pipeline import run_on_csv_data


def main() -> None:
    parser = argparse.ArgumentParser(
        prog='TOM-Plugin-Demo-Runner',
        description='integration testing of provided pipelines in TOM-Plugin',
    )
    subparsers = parser.add_subparsers(dest='subparser')
    parser_csv = subparsers.add_parser('csv', help='run on CSV data')
    parser_csv.add_argument('id', help='ID for data set')
    parser_csv.add_argument('filename', help='filename from configured input directory')

    args = parser.parse_args()
    if args.subparser == 'csv':
        t1 = time.perf_counter()
        run_on_csv_data(args.id, args.filename)
        t2 = time.perf_counter()
        run_time = t2 - t1
        td = timedelta(seconds=run_time)
        print(f'Application runtime was: {td}')


if __name__ == '__main__':
    main()

src/tom_plugin/env_vars.txt (new file, 2 lines)
@@ -0,0 +1,2 @@
# list of all of the library's environment variables
DOPT_TOM_PLUGIN_LIBRARY_USAGE : indicates that this wrapper application is running in library mode (used to set a different set of environment variables)
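A minimal sketch (not part of this commit) of the switch in use; the assumption here is that any non-empty value enables library mode, since _env_vars.set() converts the variable with bool() at import time.

import os

# Must be set before tom_plugin is imported, because env.set() runs on import.
os.environ['DOPT_TOM_PLUGIN_LIBRARY_USAGE'] = '1'

import tom_plugin  # noqa: E402,F401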

src/tom_plugin/pipeline.py (new file, 276 lines)
@@ -0,0 +1,276 @@
from __future__ import annotations

import os
import typing
from pathlib import Path
from typing import cast

from lang_main.analysis.graphs import (
    Graph,
    TokenGraph,
    save_to_GraphML,
)
from lang_main.constants import (
    CYTO_BASE_NETWORK_NAME,
    INPUT_PATH_FOLDER,
    SAVE_PATH_FOLDER,
    SKIP_GRAPH_POSTPROCESSING,
    SKIP_GRAPH_RESCALING,
    SKIP_GRAPH_STATIC_RENDERING,
    SKIP_PREPROCESSING,
    SKIP_TIME_ANALYSIS,
    SKIP_TOKEN_ANALYSIS,
)
from lang_main.errors import DependencyMissingError
from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import Pipeline
from lang_main.pipelines.predefined import (
    build_base_target_feature_pipe,
    build_merge_duplicates_pipe,
    build_timeline_pipe,
    build_tk_graph_pipe,
    build_tk_graph_post_pipe,
    build_tk_graph_render_pipe,
    build_tk_graph_rescaling_pipe,
)
from lang_main.types import (
    EntryPoints,
    ObjectID,
    PandasIndex,
    SpacyDoc,
    TimelineCandidates,
)
from pandas import DataFrame

# ** build pipelines
pipe_target_feat_on_csv = build_base_target_feature_pipe()
pipe_merge = build_merge_duplicates_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
pipe_graph_rescaling = build_tk_graph_rescaling_pipe(
    save_result=True,
    exit_point=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
pipe_timeline = build_timeline_pipe()

pipe_static_graph_rendering: Pipeline | None = None
# rendering depending on optional dependencies
try:
    pipe_static_graph_rendering = build_tk_graph_render_pipe(
        with_subgraphs=True,
        base_network_name=CYTO_BASE_NETWORK_NAME,
    )
except (ImportError, DependencyMissingError):
    pass


all_pipes: tuple[Pipeline | None, ...] = (
    pipe_target_feat_on_csv,
    pipe_merge,
    pipe_token_analysis,
    pipe_graph_postprocessing,
    pipe_graph_rescaling,
    pipe_static_graph_rendering,
    pipe_timeline,
)


# ENV variable: LANG_MAIN_SAVE_FOLDER : path for saving folder of current run
# ENV variable: LANG_MAIN_INPUT_DATA : path for input data of current run
def get_save_folder() -> Path:
    save_folder_env = os.environ.get('LANG_MAIN_SAVE_FOLDER', None)
    assert save_folder_env is not None, 'saving folder not defined as ENV variable'
    save_folder = Path(save_folder_env)
    assert save_folder.exists(), 'save folder does not exist'

    return save_folder


def get_path_to_dataset() -> Path:
    data_pth_env = os.environ.get('LANG_MAIN_INPUT_DATA', None)
    assert data_pth_env is not None, 'path to dataset not defined as ENV variable'
    data_pth = Path(data_pth_env)
    assert data_pth.exists(), 'path to dataset does not exist'

    return data_pth


def _set_save_folder(
    target_folder: Path,
) -> None:
    # save_folder = get_save_folder()

    for pipe in all_pipes:
        if pipe is not None:
            pipe.working_dir = target_folder


# ** preparation
def _prepare_run_on_csv(
    id: str,
    filename: str,
) -> tuple[Path, Path]:
    # output directory for intermediate results
    print(f'Saving path: {SAVE_PATH_FOLDER}', flush=True)
    target_folder = SAVE_PATH_FOLDER / id
    create_saving_folder(
        saving_path_folder=target_folder,
        overwrite_existing=True,
    )
    assert target_folder.exists(), 'target folder not existing after creation'
    # data set
    data_pth = (INPUT_PATH_FOLDER / filename).with_suffix('.csv')

    assert data_pth.exists(), 'path to data not existing'
    assert data_pth.is_file(), 'data is not a file'
    print(f'Data path: {data_pth}', flush=True)

    return target_folder, data_pth


# ** preprocessing pipeline
def _run_preprocessing_on_csv(
    target_folder: Path,
    data_pth: Path,
) -> Path:
    # data_pth = get_path_to_dataset()
    # run pipelines
    ret = typing.cast(
        tuple[DataFrame], pipe_target_feat_on_csv.run(starting_values=(data_pth,))
    )
    target_feat_data = ret[0]
    _ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))

    return target_folder


# ** token analysis
def _run_token_analysis(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TOKEN_ANALYSIS)
    loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
    preprocessed_data = loaded_results[0]
    # build token graph
    (tk_graph, _) = typing.cast(
        tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
        pipe_token_analysis.run(starting_values=(preprocessed_data,)),
    )
    tk_graph.to_GraphML(target_folder, filename='TokenGraph', directed=False)

    return target_folder


def _run_graph_postprocessing(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_POST)
    loaded_results = cast(
        tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
        load_pickle(entry_point_path),
    )
    tk_graph = loaded_results[0]
    # filter graph by edge weight and remove single nodes (no connection)
    ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
    tk_graph_filtered = ret[0]
    tk_graph_filtered.to_GraphML(
        target_folder, filename='TokenGraph-filtered', directed=False
    )

    return target_folder


def _run_graph_edge_rescaling(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_ANALYSIS)
    loaded_results = cast(
        tuple[TokenGraph],
        load_pickle(entry_point_path),
    )
    tk_graph = loaded_results[0]
    tk_graph_rescaled, tk_graph_rescaled_undirected = cast(
        tuple[TokenGraph, Graph], pipe_graph_rescaling.run(starting_values=(tk_graph,))
    )
    tk_graph_rescaled.to_GraphML(
        target_folder, filename='TokenGraph-directed-rescaled', directed=False
    )
    save_to_GraphML(
        tk_graph_rescaled_undirected,
        saving_path=target_folder,
        filename='TokenGraph-undirected-rescaled',
    )

    return target_folder


def _run_static_graph_rendering(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(
        target_folder,
        EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
    )
    loaded_results = cast(
        tuple[TokenGraph, Graph],
        load_pickle(entry_point_path),
    )
    _ = loaded_results[0]
    tk_graph_rescaled_undirected = loaded_results[1]

    if pipe_static_graph_rendering is not None:
        _ = pipe_static_graph_rendering.run(starting_values=(tk_graph_rescaled_undirected,))

    return target_folder


# ** time analysis
def _run_time_analysis(
    target_folder: Path,
) -> Path:
    # load entry point
    # save_folder = get_save_folder()
    entry_point_path = get_entry_point(target_folder, EntryPoints.TIMELINE)
    loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
    preprocessed_data = loaded_results[0]

    _ = cast(
        tuple[TimelineCandidates, dict[ObjectID, str]],
        pipe_timeline.run(starting_values=(preprocessed_data,)),
    )

    return target_folder


def _build_pipeline_container(
    target_folder: Path,
) -> Pipeline:
    # save_folder = get_save_folder()
    # container = PipelineContainer(name='Pipeline-Container-Base', working_dir=target_folder)
    container = Pipeline(name='Pipeline-Base', working_dir=target_folder)
    container.add(_run_preprocessing_on_csv, skip=SKIP_PREPROCESSING)
    container.add(_run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
    container.add(_run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
    container.add(_run_graph_edge_rescaling, skip=SKIP_GRAPH_RESCALING)
    container.add(_run_static_graph_rendering, skip=SKIP_GRAPH_STATIC_RENDERING)
    container.add(_run_time_analysis, skip=SKIP_TIME_ANALYSIS)

    return container


def run_on_csv_data(
    id: str,
    filename: str,
) -> None:
    target_folder, data_pth = _prepare_run_on_csv(id=id, filename=filename)
    _set_save_folder(target_folder)
    procedure = _build_pipeline_container(target_folder)
    procedure.run(starting_values=(target_folder, data_pth))

test.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import os

os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'python'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'bin'

from tom_plugin import pipeline


def run_pipe() -> None:
    # lang-data\in\Dummy_Dataset_N_1000.csv
    # relative_path = r'.\lang-data\in\Dummy_Dataset_N_1000.csv'
    # absolute_path = r'A:\Arbeitsaufgaben\lang-data\in\Dummy_Dataset_N_1000.csv'
    filename: str = 'Dummy_Dataset_N_1000.csv'

    # pipeline.run_on_csv_data(id='123', filename=relative_path)
    # pipeline.run_on_csv_data(id='124', filename=absolute_path)
    pipeline.run_on_csv_data(id='1234', filename=filename)


if __name__ == '__main__':
    run_pipe()

tests/__init__.py (new file, empty)

tom-plugin.code-workspace (new file, 8 lines)
@@ -0,0 +1,8 @@
{
    "folders": [
        {
            "path": "."
        }
    ],
    "settings": {}
}

update_and_publish.ps1 (new file, 2 lines)
@@ -0,0 +1,2 @@
pdm update -u -x lang-main
pdm publish -r local --skip-existing

update_lang_main.ps1 (new file, 1 line)
@@ -0,0 +1 @@
pdm update -u -x lang-main