initial commit V0.1.0

Florian Förster 2025-01-23 12:05:13 +01:00
commit 7786e2660c
26 changed files with 10127 additions and 0 deletions

162
.gitignore vendored Normal file

@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm-project.org/#use-with-ide
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

1
README.md Normal file

@ -0,0 +1 @@
# tom-plugin

5034
benchmark/model_tests.ipynb Normal file

File diff suppressed because it is too large

57
benchmark/stfr.py Normal file

@ -0,0 +1,57 @@
import os
from pathlib import Path
os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'
from lang_main.constants import SimilarityFunction
from lang_main.model_loader import load_sentence_transformer as load_stfr
from lang_main.types import (
ONNXExecutionProvider,
STFRBackends,
STFRDeviceTypes,
STFRModelArgs,
TorchDTypes,
)
MODEL_NAME = 'mixedbread-ai/deepset-mxbai-embed-de-large-v1'
MODEL_ARGS: STFRModelArgs = {
# 'torch_dtype': 'float32',
'export': False,
# 'file_name': 'onnx/model_uint8.onnx', # type: ignore
'file_name': 'onnx/model_quantized.onnx', # type: ignore
'provider': ONNXExecutionProvider.CPU,
}
MODEL_PATH = Path(r'A:\Arbeitsaufgaben\lang-models')
def load_models(model_name: str, trust_remote: bool = False, use_onnx: bool = False):
assert MODEL_PATH.exists(), 'model path not existing'
if use_onnx:
model_kwargs = MODEL_ARGS
backend = STFRBackends.ONNX
else:
model_kwargs = {'torch_dtype': 'float32'}
backend = STFRBackends.TORCH
stfr_model = load_stfr(
model_name=model_name, # type: ignore
similarity_func=SimilarityFunction.COSINE,
backend=backend,
local_files_only=False,
trust_remote_code=trust_remote,
model_save_folder=str(MODEL_PATH),
model_kwargs=model_kwargs,
)
return stfr_model
def main():
load_models(MODEL_NAME)
if __name__ == '__main__':
main()
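
A hedged usage sketch for the loader above (not part of this commit): use_onnx is the only switch between the PyTorch float32 weights and the quantised ONNX weights. The import assumes benchmark/ is the current working directory so the script is importable as stfr; the sample sentences are made up.

from stfr import MODEL_NAME, load_models  # assumes benchmark/ as working directory

torch_model = load_models(MODEL_NAME)                # PyTorch backend, float32
onnx_model = load_models(MODEL_NAME, use_onnx=True)  # quantised ONNX backend on CPU

sentences = ['Pumpe defekt', 'Motor ausgetauscht']   # made-up German sample texts
print(torch_model.encode(sentences).shape)
print(onnx_model.encode(sentences).shape)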

2
benchmark/test.toml Normal file

@ -0,0 +1,2 @@
[test]
t1 = nan

1
build.ps1 Normal file

@ -0,0 +1 @@
pdm build -d build/

2
bump_prerelease_num.ps1 Normal file

@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_n
pdm run bump-my-version show current_version

2
bump_release_type.ps1 Normal file

@ -0,0 +1,2 @@
pdm run bump-my-version bump pre_l
pdm run bump-my-version show current_version

59
lang_main_config.toml Normal file

@ -0,0 +1,59 @@
# d-opt -- lang_main: config file
[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'
[models]
use_large_model = false
[logging]
enabled = true
stderr = true
file = true
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92
[graph_postprocessing]
max_edge_number = -1
[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8
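
The [control] section above is the switchboard for the pipeline stages: each *_skip flag disables one stage. A minimal, illustrative sketch of reading it with Python's standard tomllib (available since the project requires Python >= 3.11); the file name is this config, everything else is only for demonstration and not part of the commit.

import tomllib
from pathlib import Path

with Path('lang_main_config.toml').open('rb') as fh:  # tomllib only accepts binary files
    config = tomllib.load(fh)

for flag, skipped in config['control'].items():
    print(f'{flag}: {"skipped" if skipped else "executed"}')

print('preprocessing similarity threshold:', config['preprocess']['threshold_similarity'])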

56
lang_main_config_old.toml Normal file

@ -0,0 +1,56 @@
# d-opt -- lang_main: config file
[paths]
inputs = './lang-data/in/'
results = './lang-data/out/'
models = './lang-models/converted'
[logging]
enabled = true
stderr = true
file = true
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = true
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
target_feature = "VorgangsBeschreibung"
threshold_amount_characters = 5
threshold_similarity = 0.92
[graph_postprocessing]
max_edge_number = -1
[time_analysis.uniqueness]
threshold_unique_texts = 5
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8


@ -0,0 +1,63 @@
# lang_main: Config file
[paths]
inputs = './lang-data/in'
# results = './results/dummy_N_1000/'
# dataset = '../data/Dummy_Dataset_N_1000.csv'
results = './lang-data/out'
models = './lang-models/converted'
[logging]
enabled = true
stderr = true
file = true
# control which pipelines are executed
[control]
preprocessing_skip = false
token_analysis_skip = false
graph_postprocessing_skip = false
graph_rescaling_skip = false
graph_static_rendering_skip = false
time_analysis_skip = true
[preprocess]
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
threshold_similarity = 0.8
[graph_postprocessing]
threshold_edge_number = 500
# threshold_edge_weight = 150
[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'
feature_name_obj_text = 'HObjektText'
[time_analysis.preparation]
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
name_delta_feat_to_next_failure = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
[time_analysis.model_input]
# input_features = [
# 'VorgangsTypName',
# 'VorgangsArtText',
# 'VorgangsBeschreibung',
# ]
input_features = [
'VorgangsBeschreibung',
]
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
threshold_num_activities = 1
threshold_similarity = 0.8

3966
pdm.lock generated Normal file

File diff suppressed because it is too large

1
publish.ps1 Normal file

@ -0,0 +1 @@
pdm publish -r local --skip-existing

139
pyproject.toml Normal file

@ -0,0 +1,139 @@
[project]
name = "tom-plugin"
version = "0.1.0"
description = "Wrapper for TOM plugins with various helper CLIs, primarily for integration testing"
authors = [
{name = "d-opt GmbH, resp. Florian Förster", email = "f.foerster@d-opt.com"},
]
dependencies = ["lang-main[spacy-md,spacy-trf]>=0.1.0"]
requires-python = ">=3.11"
readme = "README.md"
license = {text = "MIT"}
[project.scripts]
model-download = "tom_plugin._tools._load_model:main"
pipeline-test = "tom_plugin._tools._run:main"
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[tool.ruff]
line-length = 94
indent-width = 4
target-version = "py311"
src = ["src"]
[tool.ruff.format]
quote-style = "single"
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = ["E", "F", "I"]
[tool.ruff.lint.isort]
extra-standard-library = ["typing_extensions"]
[tool.pytest.ini_options]
addopts = [
"-vvl",
"--import-mode=importlib",
]
testpaths = [
"tests",
]
filterwarnings = [
'ignore:pkg_resources is deprecated as an API.:DeprecationWarning'
]
markers = [
]
log_cli = true
[tool.coverage.run]
relative_files = true
source = [
"tom_plugin",
"tests/",
]
[tool.coverage.report]
exclude_also = [
"def __repr__",
"def __str__",
"@overload",
"if logging",
"if TYPE_CHECKING",
"@pytest.fixture",
"if __name__ == __main__:",
]
[tool.coverage.html]
directory = "reports/coverage"
[tool.pdm]
distribution = true
[tool.pdm.resolution]
respect-source-order = true
[[tool.pdm.source]]
name = "private"
url = "http://localhost:8001/simple"
verify_ssl = false
[[tool.pdm.source]]
name = "pypi"
url = "https://pypi.org/simple"
exclude_packages = ["lang-main*", "tom-plugin*"]
[dependency-groups]
dev = [
"bump-my-version>=0.29.0",
"jupyterlab>=4.3.4",
"ipywidgets>=8.1.5",
]
[tool.bumpversion]
current_version = "0.1.0"
parse = """(?x)
(?P<major>0|[1-9]\\d*)\\.
(?P<minor>0|[1-9]\\d*)\\.
(?P<patch>0|[1-9]\\d*)
(?:
# separator for pre-release section
(?P<pre_l>[a-zA-Z-]+) # pre-release label
(?P<pre_n>0|[1-9]\\d*) # pre-release version number
)? # pre-release section is optional
"""
serialize = [
"{major}.{minor}.{patch}{pre_l}{pre_n}",
"{major}.{minor}.{patch}",
]
search = "{current_version}"
replace = "{new_version}"
regex = false
ignore_missing_version = false
ignore_missing_files = false
tag = false
sign_tags = false
tag_name = "v{new_version}"
tag_message = "Bump version: {current_version} → {new_version}"
allow_dirty = true
commit = false
message = "Bump version: {current_version} → {new_version}"
commit_args = ""
setup_hooks = []
pre_commit_hooks = []
post_commit_hooks = []
[tool.bumpversion.parts.pre_l]
values = ["dev", "a", "b", "rc", "final"]
optional_value = "final"
[[tool.bumpversion.files]]
filename = "pyproject.toml"
search = "version = \"{current_version}\""
replace = "version = \"{new_version}\""
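
The parse/serialize pair in [tool.bumpversion] above defines which version strings are accepted: a plain major.minor.patch release, optionally followed by a pre-release label from the pre_l values listed above and a number (so the bump_*.ps1 helpers step through versions such as 0.1.0dev0). An illustrative check of that grammar with Python's re module; the pattern is transcribed from the TOML, the sample versions are made up and not part of the commit.

import re

VERSION_RE = re.compile(
    r"""(?x)
    (?P<major>0|[1-9]\d*)\.
    (?P<minor>0|[1-9]\d*)\.
    (?P<patch>0|[1-9]\d*)
    (?:
        (?P<pre_l>[a-zA-Z-]+)   # pre-release label, e.g. dev, a, b, rc
        (?P<pre_n>0|[1-9]\d*)   # pre-release number
    )?
    """
)

for candidate in ('0.1.0', '0.1.0dev0', '0.1.0rc1', '0.1'):
    match = VERSION_RE.fullmatch(candidate)
    print(candidate, '->', match.groupdict() if match else 'no match')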


@ -0,0 +1,3 @@
from tom_plugin import _env_vars as env
env.set()

46
src/tom_plugin/_env_vars.py Normal file

@ -0,0 +1,46 @@
import os
from typing import Final
# ** ENV VARS
def set() -> None:
library_mode = os.environ.get('DOPT_TOM_PLUGIN_LIBRARY_USAGE', None)
LIBRARY_MODE: Final[bool] = bool(library_mode)
if LIBRARY_MODE:
_set_lib_mode()
else:
_set_app_mode(
spacy_model=None,
STFR_model=None,
)
def _set_lib_mode() -> None:
os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'
def _set_app_mode(
spacy_model: str | None = None,
STFR_model: str | None = None,
) -> None:
os.environ['LANG_MAIN_STFR_BACKEND'] = 'onnx'
os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'tom-plugin'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'tom-plugin'
if spacy_model is not None:
_set_spacy_model(spacy_model)
if STFR_model is not None:
_set_STFR_model(STFR_model)
def _set_spacy_model(
model_name: str = 'de_core_news_md',
) -> None:
os.environ['LANG_MAIN_SPACY_MODEL'] = model_name
def _set_STFR_model(
model_name: str = 'all-mpnet-base-v2',
) -> None:
# assumption: LANG_MAIN_STFR_MODEL is the SentenceTransformer counterpart to
# LANG_MAIN_SPACY_MODEL; the exact variable name is not confirmed by lang_main
os.environ['LANG_MAIN_STFR_MODEL'] = model_name
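
A hedged usage sketch for the module above (not part of this commit), assuming the three-line file earlier in the commit is the package __init__ that calls env.set() on import: DOPT_TOM_PLUGIN_LIBRARY_USAGE must therefore be exported before tom_plugin is imported, and any non-empty string counts as enabled.

import os

# Must be set before the first `import tom_plugin`, because env.set() runs at import time.
os.environ['DOPT_TOM_PLUGIN_LIBRARY_USAGE'] = '1'  # any non-empty value enables library mode

import tom_plugin  # noqa: E402,F401  (importing triggers _env_vars.set())

print(os.environ['LANG_MAIN_STFR_BACKEND'])  # 'onnx' in both library and app mode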


194
src/tom_plugin/_tools/_load_model.py Normal file

@ -0,0 +1,194 @@
import argparse
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, cast
from lang_main.constants import stfr_model_args_default
from lang_main.model_loader import (
MODEL_BASE_FOLDER,
STFR_BACKEND,
STFR_DEVICE,
STFR_MODEL_ARGS,
STFR_MODEL_NAME,
STFR_SIMILARITY,
load_sentence_transformer,
)
from lang_main.types import (
SentenceTransformer,
STFRBackends,
STFRModelArgs,
STFRONNXFilenames,
)
from sentence_transformers.backend import (
export_dynamic_quantized_onnx_model,
export_optimized_onnx_model,
)
@dataclass
class TypedArgumentParser:
default: bool
model: str
path: Path | None
convert: bool
optim: bool
quant: bool
def get_model_name_from_repo(
full_model_name: str,
) -> str:
return full_model_name.split('/')[-1]
def _preload_STFR_model(
model_name_repo: str,
backend: STFRBackends,
model_kwargs: STFRModelArgs | dict[str, Any] | None,
target_folder: Path | str | None,
) -> SentenceTransformer:
save_folder: str | None = None
if target_folder is not None:
save_folder = str(target_folder)
return load_sentence_transformer(
model_name=model_name_repo,
similarity_func=STFR_SIMILARITY,
backend=backend,
device=STFR_DEVICE,
model_kwargs=model_kwargs,
model_save_folder=save_folder,
local_files_only=False,
force_download=True,
)
def _load_config_STFR_model() -> None:
_ = _preload_STFR_model(
model_name_repo=STFR_MODEL_NAME,
backend=STFR_BACKEND,
model_kwargs=STFR_MODEL_ARGS,
target_folder=None,
)
def _model_conversion(
model_name_repo: str,
quant: bool,
optimise: bool,
target_folder: Path | None,
) -> None:
model_name = get_model_name_from_repo(model_name_repo)
base_folder: Path = MODEL_BASE_FOLDER
if target_folder is not None:
base_folder = target_folder
if base_folder.stem == 'converted':
export_folder = (base_folder / model_name).resolve()
else:
export_folder = (base_folder / 'converted' / model_name).resolve()
# attempt to download base model if not present
_ = _preload_STFR_model(
model_name_repo=model_name_repo,
backend=STFRBackends.TORCH,
model_kwargs=stfr_model_args_default,
target_folder=base_folder,
)
model_onnx = _preload_STFR_model(
model_name_repo=model_name_repo,
backend=STFRBackends.ONNX,
model_kwargs=None,
target_folder=base_folder,
)
model_onnx.save_pretrained(path=str(export_folder), safe_serialization=True)
path_export_onnx_base = export_folder / 'onnx' / 'model.onnx'
assert path_export_onnx_base.exists(), 'ONNX base weights not existing'
print(f'Saved converted ONNX model under: {path_export_onnx_base}')
if quant:
export_dynamic_quantized_onnx_model(
model_onnx, quantization_config='avx2', model_name_or_path=str(export_folder)
)
path_export_onnx_quant = export_folder / STFRONNXFilenames.ONNX_Q_UINT8
assert path_export_onnx_quant.exists(), 'ONNX quant weights not existing'
print(f'Saved quantised ONNX model under: {path_export_onnx_quant}')
os.remove(path_export_onnx_base)
if optimise:
export_optimized_onnx_model(
model_onnx, optimization_config='O3', model_name_or_path=str(export_folder)
)
path_export_onnx_optim = export_folder / STFRONNXFilenames.ONNX_OPT_O3
assert path_export_onnx_optim.exists(), 'ONNX optimised weights not existing'
print(f'Saved optimised ONNX model under: {path_export_onnx_optim}')
os.remove(path_export_onnx_base)
def main() -> None:
parser = argparse.ArgumentParser(
prog='STFR-Model-Loader',
description=(
'Helper program to pre-download SentenceTransformer models '
'and convert them to different formats if desired'
),
)
parser.add_argument(
'-d', '--default', action='store_true', help='load model from default config'
)
parser.add_argument(
'-m',
'--model',
default=STFR_MODEL_NAME,
help='model to load (full repo name from Hugging Face Hub)',
)
parser.add_argument(
'-p', '--path', type=Path, default=None, help='path to save models to'
)
parser.add_argument(
'-c', '--convert', action='store_true', help='convert model to ONNX format'
)
parser.add_argument(
'-o',
'--optim',
action='store_true',
help=(
'optimise ONNX model with O3 profile, model is '
'always converted to ONNX beforehand'
),
)
# parser.add_argument('--onnx', action='store_true', help='use ONNX backend')
parser.add_argument(
'--quant',
action='store_true',
help=(
'quantise model with "AVX2" configuration, model is always '
'converted to ONNX beforehand'
),
)
args = cast(TypedArgumentParser, parser.parse_args())
use_default_model = args.default
convert_model = args.convert
optimise_model = args.optim
quantise_model = args.quant
if use_default_model and convert_model:
raise ValueError('Loading default model does not allow model conversion')
path_models: Path | None = None
if args.path is not None:
path_models = args.path.resolve()
assert path_models.exists(), 'model saving path not existing'
assert path_models.is_dir(), 'model saving path not a directory'
if args.default:
_load_config_STFR_model()
else:
_model_conversion(
model_name_repo=args.model,
quant=quantise_model,
optimise=optimise_model,
target_folder=path_models,
)
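
For reference, a sketch of driving this tool through the model-download console script declared in pyproject.toml (available once the package is installed); this is not part of the commit. The flags mirror the argparse definition above, the repository name comes from the benchmark script, and the target directory is a placeholder.

import subprocess

# Download the model, convert it to ONNX and produce an AVX2-quantised variant.
# The --path directory must already exist (the tool asserts this).
subprocess.run(
    [
        'model-download',
        '--model', 'mixedbread-ai/deepset-mxbai-embed-de-large-v1',
        '--path', './lang-models',
        '--convert',
        '--quant',
    ],
    check=True,
)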

29
src/tom_plugin/_tools/_run.py Normal file

@ -0,0 +1,29 @@
import argparse
import time
from datetime import timedelta
from tom_plugin.pipeline import run_on_csv_data
def main() -> None:
parser = argparse.ArgumentParser(
prog='TOM-Plugin-Demo-Runner',
description='integration testing of provided pipelines in TOM-Plugin',
)
subparsers = parser.add_subparsers(dest='subparser')
parser_csv = subparsers.add_parser('csv', help='run on CSV data')
parser_csv.add_argument('id', help='ID for data set')
parser_csv.add_argument('filename', help='filename from configured input directory')
args = parser.parse_args()
if args.subparser == 'csv':
t1 = time.perf_counter()
run_on_csv_data(args.id, args.filename)
t2 = time.perf_counter()
run_time = t2 - t1
td = timedelta(seconds=run_time)
print(f'Application runtime was: {td}')
if __name__ == '__main__':
main()
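
Correspondingly, the pipeline-test entry point from pyproject.toml wraps this runner. A short invocation sketch (not part of the commit); the ID and data set name are the ones used in test.py later in this commit, and the file name is resolved against the configured input directory.

import subprocess

# Equivalent to running `pipeline-test csv 1234 Dummy_Dataset_N_1000` in a shell;
# the filename gets a .csv suffix and is looked up under [paths].inputs.
subprocess.run(['pipeline-test', 'csv', '1234', 'Dummy_Dataset_N_1000'], check=True)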


@ -0,0 +1,2 @@
# List of all of the library's environment variables
DOPT_TOM_PLUGIN_LIBRARY_USAGE : indicates that this wrapper application runs in library mode (used to select a different set of environment variables)

276
src/tom_plugin/pipeline.py Normal file

@ -0,0 +1,276 @@
from __future__ import annotations
import os
import typing
from pathlib import Path
from typing import cast
from lang_main.analysis.graphs import (
Graph,
TokenGraph,
save_to_GraphML,
)
from lang_main.constants import (
CYTO_BASE_NETWORK_NAME,
INPUT_PATH_FOLDER,
SAVE_PATH_FOLDER,
SKIP_GRAPH_POSTPROCESSING,
SKIP_GRAPH_RESCALING,
SKIP_GRAPH_STATIC_RENDERING,
SKIP_PREPROCESSING,
SKIP_TIME_ANALYSIS,
SKIP_TOKEN_ANALYSIS,
)
from lang_main.errors import DependencyMissingError
from lang_main.io import create_saving_folder, get_entry_point, load_pickle
from lang_main.pipelines.base import Pipeline
from lang_main.pipelines.predefined import (
build_base_target_feature_pipe,
build_merge_duplicates_pipe,
build_timeline_pipe,
build_tk_graph_pipe,
build_tk_graph_post_pipe,
build_tk_graph_render_pipe,
build_tk_graph_rescaling_pipe,
)
from lang_main.types import (
EntryPoints,
ObjectID,
PandasIndex,
SpacyDoc,
TimelineCandidates,
)
from pandas import DataFrame
# ** build pipelines
pipe_target_feat_on_csv = build_base_target_feature_pipe()
pipe_merge = build_merge_duplicates_pipe()
pipe_token_analysis = build_tk_graph_pipe()
pipe_graph_postprocessing = build_tk_graph_post_pipe()
pipe_graph_rescaling = build_tk_graph_rescaling_pipe(
save_result=True,
exit_point=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
pipe_timeline = build_timeline_pipe()
pipe_static_graph_rendering: Pipeline | None = None
# rendering depending on optional dependencies
try:
pipe_static_graph_rendering = build_tk_graph_render_pipe(
with_subgraphs=True,
base_network_name=CYTO_BASE_NETWORK_NAME,
)
except (ImportError, DependencyMissingError):
pass
all_pipes: tuple[Pipeline | None, ...] = (
pipe_target_feat_on_csv,
pipe_merge,
pipe_token_analysis,
pipe_graph_postprocessing,
pipe_graph_rescaling,
pipe_static_graph_rendering,
pipe_timeline,
)
# ENV variable: LANG_MAIN_SAVE_FOLDER : path for saving folder of current run
# ENV variable: LANG_MAIN_INPUT_DATA : path for input data of current run
def get_save_folder() -> Path:
save_folder_env = os.environ.get('LANG_MAIN_SAVE_FOLDER', None)
assert save_folder_env is not None, 'saving folder not defined as ENV variable'
save_folder = Path(save_folder_env)
assert save_folder.exists(), 'save folder does not exist'
return save_folder
def get_path_to_dataset() -> Path:
data_pth_env = os.environ.get('LANG_MAIN_INPUT_DATA', None)
assert data_pth_env is not None, 'path to dataset not defined as ENV variable'
data_pth = Path(data_pth_env)
assert data_pth.exists(), 'path to dataset does not exist'
return data_pth
def _set_save_folder(
target_folder: Path,
) -> None:
# save_folder = get_save_folder()
for pipe in all_pipes:
if pipe is not None:
pipe.working_dir = target_folder
# ** preparation
def _prepare_run_on_csv(
id: str,
filename: str,
) -> tuple[Path, Path]:
# output directory for intermediate results
print(f'Saving path: {SAVE_PATH_FOLDER}', flush=True)
target_folder = SAVE_PATH_FOLDER / id
create_saving_folder(
saving_path_folder=target_folder,
overwrite_existing=True,
)
assert target_folder.exists(), 'target folder not existing after creation'
# data set
data_pth = (INPUT_PATH_FOLDER / filename).with_suffix('.csv')
assert data_pth.exists(), 'path to data not existing'
assert data_pth.is_file(), 'data is not a file'
print(f'Data path: {data_pth}', flush=True)
return target_folder, data_pth
# ** preprocessing pipeline
def _run_preprocessing_on_csv(
target_folder: Path,
data_pth: Path,
) -> Path:
# data_pth = get_path_to_dataset()
# run pipelines
ret = typing.cast(
tuple[DataFrame], pipe_target_feat_on_csv.run(starting_values=(data_pth,))
)
target_feat_data = ret[0]
_ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))
return target_folder
# ** token analysis
def _run_token_analysis(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TOKEN_ANALYSIS)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
# build token graph
(tk_graph, _) = typing.cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
pipe_token_analysis.run(starting_values=(preprocessed_data,)),
)
tk_graph.to_GraphML(target_folder, filename='TokenGraph', directed=False)
return target_folder
def _run_graph_postprocessing(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_POST)
loaded_results = cast(
tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
# filter graph by edge weight and remove single nodes (no connection)
ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
tk_graph_filtered = ret[0]
tk_graph_filtered.to_GraphML(
target_folder, filename='TokenGraph-filtered', directed=False
)
return target_folder
def _run_graph_edge_rescaling(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TK_GRAPH_ANALYSIS)
loaded_results = cast(
tuple[TokenGraph],
load_pickle(entry_point_path),
)
tk_graph = loaded_results[0]
tk_graph_rescaled, tk_graph_rescaled_undirected = cast(
tuple[TokenGraph, Graph], pipe_graph_rescaling.run(starting_values=(tk_graph,))
)
tk_graph_rescaled.to_GraphML(
target_folder, filename='TokenGraph-directed-rescaled', directed=False
)
save_to_GraphML(
tk_graph_rescaled_undirected,
saving_path=target_folder,
filename='TokenGraph-undirected-rescaled',
)
return target_folder
def _run_static_graph_rendering(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(
target_folder,
EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
loaded_results = cast(
tuple[TokenGraph, Graph],
load_pickle(entry_point_path),
)
_ = loaded_results[0]
tk_graph_rescaled_undirected = loaded_results[1]
if pipe_static_graph_rendering is not None:
_ = pipe_static_graph_rendering.run(starting_values=(tk_graph_rescaled_undirected,))
return target_folder
# ** time analysis
def _run_time_analysis(
target_folder: Path,
) -> Path:
# load entry point
# save_folder = get_save_folder()
entry_point_path = get_entry_point(target_folder, EntryPoints.TIMELINE)
loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
preprocessed_data = loaded_results[0]
_ = cast(
tuple[TimelineCandidates, dict[ObjectID, str]],
pipe_timeline.run(starting_values=(preprocessed_data,)),
)
return target_folder
def _build_pipeline_container(
target_folder: Path,
) -> Pipeline:
# save_folder = get_save_folder()
# container = PipelineContainer(name='Pipeline-Container-Base', working_dir=target_folder)
container = Pipeline(name='Pipeline-Base', working_dir=target_folder)
container.add(_run_preprocessing_on_csv, skip=SKIP_PREPROCESSING)
container.add(_run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
container.add(_run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
container.add(_run_graph_edge_rescaling, skip=SKIP_GRAPH_RESCALING)
container.add(_run_static_graph_rendering, skip=SKIP_GRAPH_STATIC_RENDERING)
container.add(_run_time_analysis, skip=SKIP_TIME_ANALYSIS)
return container
def run_on_csv_data(
id: str,
filename: str,
) -> None:
target_folder, data_pth = _prepare_run_on_csv(id=id, filename=filename)
_set_save_folder(target_folder)
procedure = _build_pipeline_container(target_folder)
procedure.run(starting_values=(target_folder, data_pth))
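
The comments above also document LANG_MAIN_SAVE_FOLDER and LANG_MAIN_INPUT_DATA, which feed the (currently unused) helpers get_save_folder() and get_path_to_dataset(). A small sketch of that contract, not part of the commit; the paths are placeholders and must exist, since both helpers assert on them.

import os

from tom_plugin import pipeline

os.environ['LANG_MAIN_SAVE_FOLDER'] = './lang-data/out'              # placeholder, must exist
os.environ['LANG_MAIN_INPUT_DATA'] = './lang-data/in/dataset.csv'    # placeholder, must exist

print(pipeline.get_save_folder())       # resolved saving folder for the current run
print(pipeline.get_path_to_dataset())   # resolved input data set for the current run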

21
test.py Normal file

@ -0,0 +1,21 @@
import os
os.environ['LANG_MAIN_STOP_SEARCH_FOLDERNAME'] = 'python'
os.environ['LANG_MAIN_BASE_FOLDERNAME'] = 'bin'
from tom_plugin import pipeline
def run_pipe() -> None:
# lang-data\in\Dummy_Dataset_N_1000.csv
# relative_path = r'.\lang-data\in\Dummy_Dataset_N_1000.csv'
# absolute_path = r'A:\Arbeitsaufgaben\lang-data\in\Dummy_Dataset_N_1000.csv'
filename: str = 'Dummy_Dataset_N_1000.csv'
# pipeline.run_on_csv_data(id='123', filename=relative_path)
# pipeline.run_on_csv_data(id='124', filename=absolute_path)
pipeline.run_on_csv_data(id='1234', filename=filename)
if __name__ == '__main__':
run_pipe()

0
tests/__init__.py Normal file


@ -0,0 +1,8 @@
{
"folders": [
{
"path": "."
}
],
"settings": {}
}

2
update_and_publish.ps1 Normal file

@ -0,0 +1,2 @@
pdm update -u -x lang-main
pdm publish -r local --skip-existing

1
update_lang_main.ps1 Normal file

@ -0,0 +1 @@
pdm update -u -x lang-main