In [1]:
from pathlib import Path
import time

from stfr import load_models

import pandas as pd
import numpy as np

Looking iteratively for config file. Start: A:\Arbeitsaufgaben\tom-plugin\.venv\Lib\site-packages\lang_main, stop folder: tom-plugin
Loaded TOML config file successfully.
Loaded config from: >>A:\Arbeitsaufgaben\tom-plugin\lang_main_config.toml<<
Library path is: A:\Arbeitsaufgaben
Root path is: A:\Arbeitsaufgaben


In [2]:
from lang_main.types import STFRModelTypes

In [3]:
from lang_main.pipelines import predefined
from lang_main.analysis import preprocessing as preproc

In [28]:
DATA_PTH = Path(r'A:\Arbeitsaufgaben\lang-data\in\02_202307\Export4.csv')

def preprocess_data(path, num_entries):
    assert path.exists()
    pipe_target_feat = predefined.build_base_target_feature_pipe()
    ret = pipe_target_feat.run(starting_values=(path,))
    df = ret[0]
    cropped = df.iloc[:num_entries]
    entries = tuple(cropped['entry'])

    return entries

In [29]:
models_benchmark = (
    # STFRModelTypes.ALL_MPNET_BASE_V2,
    # STFRModelTypes.PARAPHRASE_MULTI_MPNET_BASE_V2,
    # STFRModelTypes.JINAAI_BASE_DE_V2,
    STFRModelTypes.GERMAN_SEMANTIC_STS_V2,
    STFRModelTypes.E5_BASE_STS_EN_DE,
)

In [72]:
def benchmark_sims(model, docs, batch_size=32):
    if model.max_seq_length > 1024:
        model.max_seq_length = 1024

    t1 = time.perf_counter()
    embds = model.encode(docs, convert_to_numpy=False, convert_to_tensor=True, batch_size=batch_size)
    sims = model.similarity(embds, embds).numpy()
    t2 = time.perf_counter()
    encoding_dur = t2 - t1
    
    return sims, encoding_dur

In [31]:
def run_benchmark(models, docs, iterations_per_model):
    print(f'Benchmark for number of entries: {len(docs)}')
    print(f'Iterations per model: {iterations_per_model}')
    
    for model_name in models:
        times_with_load = []
        times_encoding = []
        model = load_models(model_name, trust_remote=True)
        
        for it in range(iterations_per_model):
            t1 = time.perf_counter()
            sims, encoding_dur = benchmark_sims(model, docs)
            t2 = time.perf_counter()
            duration = t2 - t1
            times_with_load.append(duration)
            times_encoding.append(encoding_dur)

        avg_time_with_load = np.mean(times_with_load)
        avg_time_encoding = np.mean(times_encoding)
        print(f'Avg time for model >{model_name}< was:\t\twith loading: {avg_time_with_load:.6f} s\tencoding: {avg_time_encoding:.6f} s')
        

In [32]:
docs = preprocess_data(DATA_PTH, num_entries=1000)

2025-01-15 12:04:37 +0000 | lang_main:base:INFO | Starting pipeline >>Target_Feature<<...
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | Loaded dataset successfully.
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | Dataset properties: number of entries: 129020, number of features 20
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | Number of duplicates over all features: 84
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | Number of duplicates over subset >>['VorgangsID', 'ObjektID']<<: 725
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | Removed all duplicates from dataset successfully.
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | New Dataset properties: number of entries: 128211, number of features 20
2025-01-15 12:04:38 +0000 | lang_main:preprocessing:INFO | Removed NA entries for features >>['VorgangsBeschreibung']<< from dataset successfully.
2025-01-15 12:04:38 +0000 | lang_main:io:INFO | Saved file successfully unde

In [33]:
ITERATIONS_PER_MODEL = 3

In [34]:
run_benchmark(models_benchmark, docs, ITERATIONS_PER_MODEL)

No sentence-transformers model found with name aari1995/German_Semantic_STS_V2. Creating a new one with mean pooling.


Benchmark for number of entries: 1000
Iterations per model: 3
Avg time for model >aari1995/German_Semantic_STS_V2< was:		with loading: 194.963239 s	encoding: 194.963216 s
Avg time for model >danielheinz/e5-base-sts-en-de< was:		with loading: 52.009722 s	encoding: 52.009704 s


In [84]:
from sentence_transformers.SentenceTransformer import SentenceTransformer
from sentence_transformers.backend import export_optimized_onnx_model, export_dynamic_quantized_onnx_model

In [40]:
model_name = STFRModelTypes.E5_BASE_STS_EN_DE
save_path = Path(r'A:\Arbeitsaufgaben\lang-models\self-onnx')
assert save_path.exists()

In [39]:
model_onnx = SentenceTransformer(model_name, backend='onnx')

No 'model.onnx' found in 'danielheinz/e5-base-sts-en-de'. Exporting the model to ONNX.


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/756 [00:00<?, ?B/s]

Saving the exported ONNX model is heavily recommended to avoid having to export it again. Do so with `model.push_to_hub(<STFRModelTypes.E5_BASE_STS_EN_DE: 'danielheinz/e5-base-sts-en-de'>, create_pr=True)`.


In [45]:
save_base = save_path / 'base'

In [46]:
model_onnx.save_pretrained(str(save_base), safe_serialization=True)

In [49]:
#save_optimised = save_path / 'optimised'

In [50]:
export_optimized_onnx_model(model_onnx, optimization_config='O3', model_name_or_path=str(save_base))



In [54]:
onnx_path = save_base / 'onnx/model_O3.onnx'
assert onnx_path.exists()

In [85]:
model_kwargs = {'file_name': 'onnx/model.onnx', 'provider': 'CPUExecutionProvider', 'export': False}

In [86]:
model_optim = SentenceTransformer(str(save_base), model_kwargs=model_kwargs, device='cpu', backend='onnx')

In [87]:
export_dynamic_quantized_onnx_model(model_optim, quantization_config='avx2', model_name_or_path=str(save_base))

In [88]:
model_kwargs = {'file_name': 'onnx/model_quint8_avx2.onnx', 'provider': 'CPUExecutionProvider', 'export': False}

In [89]:
model_quant = SentenceTransformer(str(save_base), model_kwargs=model_kwargs, device='cpu', backend='onnx')

The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


In [90]:
sims, dur = benchmark_sims(model_quant, docs, batch_size=32)

In [91]:
dur

54.369837799997185

In [1]:
from pathlib import Path

In [2]:
p = Path(r'A:\Arbeitsaufgaben\test-download\lang-models')
p.exists()

False

In [129]:
t = 'test/123'

In [130]:
t.split('/')[-1]

'123'

In [78]:
sims, dur = benchmark_sims(model_optim, docs, batch_size=16)

In [79]:
dur

90.01054789999944

In [70]:
model_ref = SentenceTransformer('danielheinz/e5-base-sts-en-de')

In [71]:
sims, dur = benchmark_sims(model_ref, docs)
dur

74.87911759999952

In [68]:
model_name_new = 'mixedbread-ai/deepset-mxbai-embed-de-large-v1'
model_name_ref = 'all-mpnet-base-v2'


In [66]:
docs = [
    'Ölleckage durch undichten Ölsumpf',
    'Überprüfung der Schwingungsdämpfer',
    'Überprüfung der Kühlmittelsysteme',
    'Blockierung der Förderschnecke',
    'Überhitzung durch mangelnde Kühlmittelzirkulation',
    'Überprüfung der Hydraulik',
    'Ich gehe spazieren',
    'Heute um zwölf war ich unterwegs',
    'Ich gehe mit dem Hund raus',
    'Ich laufe im Park',
    'Ich laufe im Pakr',
]

In [67]:
def calc_similarities(model_name, docs):
    model = load_models(model_name)
    embds = model.encode(docs)
    sims = model.similarity(embds, embds).numpy()
    df = pd.DataFrame(data=sims, index=docs, columns=docs)
    
    return df

In [5]:
calc_similarities(model_name_ref, docs)

Unnamed: 0,Ölleckage durch undichten Ölsumpf,Überprüfung der Schwingungsdämpfer,Überprüfung der Kühlmittelsysteme,Blockierung der Förderschnecke,Überhitzung durch mangelnde Kühlmittelzirkulation,Überprüfung der Hydraulik,Ich gehe spazieren,Heute um zwölf war ich unterwegs,Ich gehe mit dem Hund raus,Ich laufe im Park,Ich laufe im Pakr
Ölleckage durch undichten Ölsumpf,1.0,0.422875,0.393899,0.23069,0.521921,0.275985,0.286303,0.401496,0.289342,0.343332,0.322299
Überprüfung der Schwingungsdämpfer,0.422875,1.0,0.519197,0.274956,0.489307,0.445541,0.302303,0.401394,0.216164,0.357596,0.284001
Überprüfung der Kühlmittelsysteme,0.393899,0.519197,1.0,0.315556,0.706944,0.486024,0.298462,0.270384,0.224679,0.253847,0.260867
Blockierung der Förderschnecke,0.23069,0.274956,0.315556,1.0,0.302503,0.249371,0.30122,0.239805,0.18149,0.296925,0.239862
Überhitzung durch mangelnde Kühlmittelzirkulation,0.521921,0.489307,0.706944,0.302503,1.0,0.405318,0.315011,0.385622,0.295077,0.295949,0.316965
Überprüfung der Hydraulik,0.275985,0.445541,0.486024,0.249371,0.405318,1.0,0.213862,0.186064,0.199054,0.183982,0.144403
Ich gehe spazieren,0.286303,0.302303,0.298462,0.30122,0.315011,0.213862,1.0,0.467547,0.545281,0.554993,0.480685
Heute um zwölf war ich unterwegs,0.401496,0.401394,0.270384,0.239805,0.385622,0.186064,0.467547,1.0,0.489523,0.481612,0.413731
Ich gehe mit dem Hund raus,0.289342,0.216164,0.224679,0.18149,0.295077,0.199054,0.545281,0.489523,1.0,0.524824,0.517329
Ich laufe im Park,0.343332,0.357596,0.253847,0.296925,0.295949,0.183982,0.554993,0.481612,0.524824,1.0,0.693493


In [69]:
docs

['Ölleckage durch undichten Ölsumpf',
 'Überprüfung der Schwingungsdämpfer',
 'Überprüfung der Kühlmittelsysteme',
 'Blockierung der Förderschnecke',
 'Überhitzung durch mangelnde Kühlmittelzirkulation',
 'Überprüfung der Hydraulik',
 'Ich gehe spazieren',
 'Heute um zwölf war ich unterwegs',
 'Ich gehe mit dem Hund raus',
 'Ich laufe im Park',
 'Ich laufe im Pakr']

In [70]:
# adaption to new model
query = 'query: Retrieve semantically similar text: '
new_docs = []
for doc in docs:
    new_doc = 'passage: ' + doc
    new_docs.append(new_doc)

new_docs

['passage: Ölleckage durch undichten Ölsumpf',
 'passage: Überprüfung der Schwingungsdämpfer',
 'passage: Überprüfung der Kühlmittelsysteme',
 'passage: Blockierung der Förderschnecke',
 'passage: Überhitzung durch mangelnde Kühlmittelzirkulation',
 'passage: Überprüfung der Hydraulik',
 'passage: Ich gehe spazieren',
 'passage: Heute um zwölf war ich unterwegs',
 'passage: Ich gehe mit dem Hund raus',
 'passage: Ich laufe im Park',
 'passage: Ich laufe im Pakr']

In [7]:
from sentence_transformers.SentenceTransformer import SentenceTransformer

In [8]:
dimensions = 1024
model = SentenceTransformer(STFRModelTypes.PARAPHRASE_MULTI_MPNET_BASE_V2)

In [92]:
model = model_quant

In [73]:
#model = load_models(model_name_new)
embds = model.encode(new_docs, convert_to_numpy=False, convert_to_tensor=True)
sims = model.similarity(embds, embds).numpy()
sims.shape
df_new_model = pd.DataFrame(data=sims, index=new_docs, columns=new_docs)
df_new_model

Unnamed: 0,passage: Ölleckage durch undichten Ölsumpf,passage: Überprüfung der Schwingungsdämpfer,passage: Überprüfung der Kühlmittelsysteme,passage: Blockierung der Förderschnecke,passage: Überhitzung durch mangelnde Kühlmittelzirkulation,passage: Überprüfung der Hydraulik,passage: Ich gehe spazieren,passage: Heute um zwölf war ich unterwegs,passage: Ich gehe mit dem Hund raus,passage: Ich laufe im Park,passage: Ich laufe im Pakr
passage: Ölleckage durch undichten Ölsumpf,1.0,0.779681,0.784307,0.823721,0.830782,0.804758,0.670097,0.705543,0.686085,0.65767,0.72428
passage: Überprüfung der Schwingungsdämpfer,0.779681,1.0,0.896381,0.840342,0.785065,0.899921,0.703496,0.718232,0.688768,0.69842,0.731654
passage: Überprüfung der Kühlmittelsysteme,0.784307,0.896381,1.0,0.796926,0.833327,0.886805,0.656722,0.712874,0.667584,0.659513,0.721201
passage: Blockierung der Förderschnecke,0.823721,0.840342,0.796926,1.0,0.793991,0.829363,0.668617,0.691422,0.687467,0.687888,0.734692
passage: Überhitzung durch mangelnde Kühlmittelzirkulation,0.830782,0.785065,0.833327,0.793991,1.0,0.788585,0.674641,0.70235,0.690236,0.683493,0.741511
passage: Überprüfung der Hydraulik,0.804758,0.899921,0.886805,0.829363,0.788585,1.0,0.690285,0.726948,0.705504,0.698003,0.739768
passage: Ich gehe spazieren,0.670097,0.703496,0.656722,0.668617,0.674641,0.690285,1.0,0.782636,0.839147,0.842675,0.794528
passage: Heute um zwölf war ich unterwegs,0.705543,0.718232,0.712874,0.691422,0.70235,0.726948,0.782636,1.0,0.752759,0.727476,0.754443
passage: Ich gehe mit dem Hund raus,0.686085,0.688768,0.667584,0.687467,0.690236,0.705504,0.839147,0.752759,1.0,0.769349,0.764451
passage: Ich laufe im Park,0.65767,0.69842,0.659513,0.687888,0.683493,0.698003,0.842675,0.727476,0.769349,1.0,0.88406


In [9]:
sims

array([[1.0000001 , 0.77968144, 0.7843068 , 0.8237207 , 0.830782  ,
        0.8047581 , 0.6700973 , 0.7055429 , 0.6860854 , 0.6576705 ,
        0.72428024],
       [0.77968144, 1.        , 0.8963811 , 0.84034187, 0.7850649 ,
        0.8999206 , 0.7034964 , 0.7182317 , 0.68876797, 0.69842   ,
        0.73165405],
       [0.7843068 , 0.8963811 , 1.        , 0.7969257 , 0.8333273 ,
        0.8868046 , 0.6567219 , 0.71287364, 0.66758376, 0.6595131 ,
        0.7212007 ],
       [0.8237207 , 0.84034187, 0.7969257 , 1.0000001 , 0.793991  ,
        0.82936305, 0.66861665, 0.6914221 , 0.68746734, 0.68788785,
        0.73469234],
       [0.830782  , 0.7850649 , 0.8333273 , 0.793991  , 1.0000001 ,
        0.7885847 , 0.6746406 , 0.7023505 , 0.6902363 , 0.68349344,
        0.7415106 ],
       [0.8047581 , 0.8999206 , 0.8868046 , 0.82936305, 0.7885847 ,
        1.0000002 , 0.6902847 , 0.72694767, 0.7055044 , 0.6980032 ,
        0.7397681 ],
       [0.6700973 , 0.7034964 , 0.6567219 , 0.66861665, 0.

In [139]:
docs

['Ölleckage durch undichten Ölsumpf',
 'Überprüfung der Schwingungsdämpfer',
 'Überprüfung der Kühlmittelsysteme',
 'Blockierung der Förderschnecke',
 'Überhitzung durch mangelnde Kühlmittelzirkulation',
 'Überprüfung der Hydraulik',
 'Ich gehe spazieren',
 'Heute um zwölf war ich unterwegs',
 'Ich gehe mit dem Hund raus',
 'Ich laufe im Park',
 'Ich laufe im Pakr']

In [10]:
model_alt = load_models('aari1995/German_Semantic_STS_V2')

No sentence-transformers model found with name aari1995/German_Semantic_STS_V2. Creating a new one with mean pooling.


In [11]:
def load_alt_model(model_name, docs):
    model_alt = load_models(model_name)
    embds = model_alt.encode(docs, convert_to_numpy=False, convert_to_tensor=True)
    sims = model_alt.similarity(embds, embds).numpy()
    df_alt_model = pd.DataFrame(data=sims, index=docs, columns=docs)
    
    return df_alt_model

In [12]:
df_ref = calc_similarities(model_name_ref, docs)
df_ref

Unnamed: 0,Ölleckage durch undichten Ölsumpf,Überprüfung der Schwingungsdämpfer,Überprüfung der Kühlmittelsysteme,Blockierung der Förderschnecke,Überhitzung durch mangelnde Kühlmittelzirkulation,Überprüfung der Hydraulik,Ich gehe spazieren,Heute um zwölf war ich unterwegs,Ich gehe mit dem Hund raus,Ich laufe im Park,Ich laufe im Pakr
Ölleckage durch undichten Ölsumpf,1.0,0.422875,0.393899,0.23069,0.521921,0.275985,0.286303,0.401496,0.289342,0.343332,0.322299
Überprüfung der Schwingungsdämpfer,0.422875,1.0,0.519197,0.274956,0.489307,0.445541,0.302303,0.401394,0.216164,0.357596,0.284001
Überprüfung der Kühlmittelsysteme,0.393899,0.519197,1.0,0.315556,0.706944,0.486024,0.298462,0.270384,0.224679,0.253847,0.260867
Blockierung der Förderschnecke,0.23069,0.274956,0.315556,1.0,0.302503,0.249371,0.30122,0.239805,0.18149,0.296925,0.239862
Überhitzung durch mangelnde Kühlmittelzirkulation,0.521921,0.489307,0.706944,0.302503,1.0,0.405318,0.315011,0.385622,0.295077,0.295949,0.316965
Überprüfung der Hydraulik,0.275985,0.445541,0.486024,0.249371,0.405318,1.0,0.213862,0.186064,0.199054,0.183982,0.144403
Ich gehe spazieren,0.286303,0.302303,0.298462,0.30122,0.315011,0.213862,1.0,0.467547,0.545281,0.554993,0.480685
Heute um zwölf war ich unterwegs,0.401496,0.401394,0.270384,0.239805,0.385622,0.186064,0.467547,1.0,0.489523,0.481612,0.413731
Ich gehe mit dem Hund raus,0.289342,0.216164,0.224679,0.18149,0.295077,0.199054,0.545281,0.489523,1.0,0.524824,0.517329
Ich laufe im Park,0.343332,0.357596,0.253847,0.296925,0.295949,0.183982,0.554993,0.481612,0.524824,1.0,0.693493


In [13]:
df_new_model

Unnamed: 0,passage: Ölleckage durch undichten Ölsumpf,passage: Überprüfung der Schwingungsdämpfer,passage: Überprüfung der Kühlmittelsysteme,passage: Blockierung der Förderschnecke,passage: Überhitzung durch mangelnde Kühlmittelzirkulation,passage: Überprüfung der Hydraulik,passage: Ich gehe spazieren,passage: Heute um zwölf war ich unterwegs,passage: Ich gehe mit dem Hund raus,passage: Ich laufe im Park,passage: Ich laufe im Pakr
passage: Ölleckage durch undichten Ölsumpf,1.0,0.779681,0.784307,0.823721,0.830782,0.804758,0.670097,0.705543,0.686085,0.65767,0.72428
passage: Überprüfung der Schwingungsdämpfer,0.779681,1.0,0.896381,0.840342,0.785065,0.899921,0.703496,0.718232,0.688768,0.69842,0.731654
passage: Überprüfung der Kühlmittelsysteme,0.784307,0.896381,1.0,0.796926,0.833327,0.886805,0.656722,0.712874,0.667584,0.659513,0.721201
passage: Blockierung der Förderschnecke,0.823721,0.840342,0.796926,1.0,0.793991,0.829363,0.668617,0.691422,0.687467,0.687888,0.734692
passage: Überhitzung durch mangelnde Kühlmittelzirkulation,0.830782,0.785065,0.833327,0.793991,1.0,0.788585,0.674641,0.70235,0.690236,0.683493,0.741511
passage: Überprüfung der Hydraulik,0.804758,0.899921,0.886805,0.829363,0.788585,1.0,0.690285,0.726948,0.705504,0.698003,0.739768
passage: Ich gehe spazieren,0.670097,0.703496,0.656722,0.668617,0.674641,0.690285,1.0,0.782636,0.839147,0.842675,0.794528
passage: Heute um zwölf war ich unterwegs,0.705543,0.718232,0.712874,0.691422,0.70235,0.726948,0.782636,1.0,0.752759,0.727476,0.754443
passage: Ich gehe mit dem Hund raus,0.686085,0.688768,0.667584,0.687467,0.690236,0.705504,0.839147,0.752759,1.0,0.769349,0.764451
passage: Ich laufe im Park,0.65767,0.69842,0.659513,0.687888,0.683493,0.698003,0.842675,0.727476,0.769349,1.0,0.88406


In [14]:
MODEL = 'aari1995/German_Semantic_STS_V2'
df_alt_model = load_alt_model(MODEL, docs)
df_alt_model

No sentence-transformers model found with name aari1995/German_Semantic_STS_V2. Creating a new one with mean pooling.


Unnamed: 0,Ölleckage durch undichten Ölsumpf,Überprüfung der Schwingungsdämpfer,Überprüfung der Kühlmittelsysteme,Blockierung der Förderschnecke,Überhitzung durch mangelnde Kühlmittelzirkulation,Überprüfung der Hydraulik,Ich gehe spazieren,Heute um zwölf war ich unterwegs,Ich gehe mit dem Hund raus,Ich laufe im Park,Ich laufe im Pakr
Ölleckage durch undichten Ölsumpf,1.0,0.503683,0.541287,0.588917,0.61116,0.597156,0.440487,0.447261,0.434366,0.48215,0.536017
Überprüfung der Schwingungsdämpfer,0.503683,1.0,0.687819,0.584369,0.551573,0.674399,0.474264,0.466305,0.437665,0.480334,0.504029
Überprüfung der Kühlmittelsysteme,0.541287,0.687819,1.0,0.613677,0.766551,0.77909,0.434425,0.462885,0.410406,0.478038,0.470977
Blockierung der Förderschnecke,0.588917,0.584369,0.613677,1.0,0.616717,0.58942,0.460361,0.45932,0.411345,0.511038,0.511924
Überhitzung durch mangelnde Kühlmittelzirkulation,0.61116,0.551573,0.766551,0.616717,1.0,0.563489,0.475599,0.522965,0.406199,0.491175,0.512755
Überprüfung der Hydraulik,0.597156,0.674399,0.77909,0.58942,0.563489,1.0,0.448907,0.427661,0.434419,0.46877,0.490589
Ich gehe spazieren,0.440487,0.474264,0.434425,0.460361,0.475599,0.448907,1.0,0.709718,0.708631,0.801886,0.719697
Heute um zwölf war ich unterwegs,0.447261,0.466305,0.462885,0.45932,0.522965,0.427661,0.709718,1.0,0.570469,0.56551,0.590207
Ich gehe mit dem Hund raus,0.434366,0.437665,0.410406,0.411345,0.406199,0.434419,0.708631,0.570469,1.0,0.597758,0.54443
Ich laufe im Park,0.48215,0.480334,0.478038,0.511038,0.491175,0.46877,0.801886,0.56551,0.597758,1.0,0.840089


In [15]:
MODEL = 'paraphrase-multilingual-mpnet-base-v2'
df_alt_model2 = load_alt_model(MODEL, docs)
df_alt_model2

Unnamed: 0,Ölleckage durch undichten Ölsumpf,Überprüfung der Schwingungsdämpfer,Überprüfung der Kühlmittelsysteme,Blockierung der Förderschnecke,Überhitzung durch mangelnde Kühlmittelzirkulation,Überprüfung der Hydraulik,Ich gehe spazieren,Heute um zwölf war ich unterwegs,Ich gehe mit dem Hund raus,Ich laufe im Park,Ich laufe im Pakr
Ölleckage durch undichten Ölsumpf,1.0,0.536319,0.429156,0.552233,0.434099,0.650479,0.162975,0.234531,0.152853,0.143636,0.30778
Überprüfung der Schwingungsdämpfer,0.536319,1.0,0.688,0.573079,0.453239,0.740294,0.125701,0.161781,0.121361,0.125878,0.239996
Überprüfung der Kühlmittelsysteme,0.429156,0.688,1.0,0.387116,0.770696,0.642362,0.123575,0.132245,0.078901,0.103046,0.202816
Blockierung der Förderschnecke,0.552233,0.573079,0.387116,1.0,0.392658,0.481602,0.165187,0.190671,0.102584,0.165351,0.357348
Überhitzung durch mangelnde Kühlmittelzirkulation,0.434099,0.453239,0.770696,0.392658,1.0,0.400166,0.06479,0.122777,0.057485,0.093556,0.185604
Überprüfung der Hydraulik,0.650479,0.740294,0.642362,0.481602,0.400166,1.0,0.148099,0.194394,0.142136,0.129403,0.241858
Ich gehe spazieren,0.162975,0.125701,0.123575,0.165187,0.06479,0.148099,1.0,0.604538,0.693227,0.605779,0.57728
Heute um zwölf war ich unterwegs,0.234531,0.161781,0.132245,0.190671,0.122777,0.194394,0.604538,1.0,0.517527,0.488817,0.614568
Ich gehe mit dem Hund raus,0.152853,0.121361,0.078901,0.102584,0.057485,0.142136,0.693227,0.517527,1.0,0.481252,0.606504
Ich laufe im Park,0.143636,0.125878,0.103046,0.165351,0.093556,0.129403,0.605779,0.488817,0.481252,1.0,0.638209


---

In [57]:
from sentence_transformers.SentenceTransformer import SentenceTransformer

In [59]:
#model = SentenceTransformer('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True)

configuration_bert.py:   0%|          | 0.00/8.24k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/jinaai/jina-bert-implementation:
- configuration_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


In [39]:
model = load_models(STFRModelTypes.JINAAI_BASE_DE_V2, trust_remote=True, use_onnx=True)
if model.max_seq_length > 1024:
    model.max_seq_length = 1024

In [40]:
docs

['Ölleckage durch undichten Ölsumpf',
 'Überprüfung der Schwingungsdämpfer',
 'Überprüfung der Kühlmittelsysteme',
 'Blockierung der Förderschnecke',
 'Überhitzung durch mangelnde Kühlmittelzirkulation',
 'Überprüfung der Hydraulik',
 'Ich gehe spazieren',
 'Heute um zwölf war ich unterwegs',
 'Ich gehe mit dem Hund raus',
 'Ich laufe im Park',
 'Ich laufe im Pakr']

In [41]:
embeddings = model.encode(docs)

In [42]:
sims = model.similarity(embeddings, embeddings).numpy()

In [21]:
import torch
from transformers import AutoModel
from numpy.linalg import norm

from sentence_transformers.util import cos_sim

In [22]:
#cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-de', trust_remote_code=True, torch_dtype=torch.float32)
embeddings = model.encode(docs, max_length=1024)
#print(cos_sim(embeddings, embeddings))

In [43]:
#sims = cos_sim(embeddings, embeddings).numpy()
df_jinaai_model = pd.DataFrame(data=sims, index=docs, columns=docs)

In [44]:
df_jinaai_model

Unnamed: 0,Ölleckage durch undichten Ölsumpf,Überprüfung der Schwingungsdämpfer,Überprüfung der Kühlmittelsysteme,Blockierung der Förderschnecke,Überhitzung durch mangelnde Kühlmittelzirkulation,Überprüfung der Hydraulik,Ich gehe spazieren,Heute um zwölf war ich unterwegs,Ich gehe mit dem Hund raus,Ich laufe im Park,Ich laufe im Pakr
Ölleckage durch undichten Ölsumpf,1.0,0.300847,0.219643,0.418877,0.303103,0.335572,0.291617,0.269322,0.249258,0.248138,0.299106
Überprüfung der Schwingungsdämpfer,0.300847,1.0,0.690428,0.395861,0.311964,0.664567,0.104605,0.123543,0.098958,0.06827,0.231832
Überprüfung der Kühlmittelsysteme,0.219643,0.690428,1.0,0.216548,0.406239,0.674355,0.118593,0.100127,0.045652,0.112418,0.17427
Blockierung der Förderschnecke,0.418877,0.395861,0.216548,1.0,0.372174,0.25657,0.161816,0.111119,0.134641,0.244183,0.238573
Überhitzung durch mangelnde Kühlmittelzirkulation,0.303103,0.311964,0.406239,0.372174,1.0,0.287111,0.118686,0.140934,0.104992,0.207168,0.147596
Überprüfung der Hydraulik,0.335572,0.664567,0.674355,0.25657,0.287111,1.0,0.142494,0.028539,0.014706,0.132489,0.233765
Ich gehe spazieren,0.291617,0.104605,0.118593,0.161816,0.118686,0.142494,1.0,0.429125,0.55241,0.713886,0.528205
Heute um zwölf war ich unterwegs,0.269322,0.123543,0.100127,0.111119,0.140934,0.028539,0.429125,1.0,0.388809,0.457369,0.408289
Ich gehe mit dem Hund raus,0.249258,0.098958,0.045652,0.134641,0.104992,0.014706,0.55241,0.388809,1.0,0.533601,0.460647
Ich laufe im Park,0.248138,0.06827,0.112418,0.244183,0.207168,0.132489,0.713886,0.457369,0.533601,1.0,0.632991


In [19]:
# without ONNX
df_jinaai_model

Unnamed: 0,Ölleckage durch undichten Ölsumpf,Überprüfung der Schwingungsdämpfer,Überprüfung der Kühlmittelsysteme,Blockierung der Förderschnecke,Überhitzung durch mangelnde Kühlmittelzirkulation,Überprüfung der Hydraulik,Ich gehe spazieren,Heute um zwölf war ich unterwegs,Ich gehe mit dem Hund raus,Ich laufe im Park,Ich laufe im Pakr
Ölleckage durch undichten Ölsumpf,1.0,0.067791,0.160749,0.179933,0.210402,0.298898,0.114172,0.028209,0.024582,0.013934,0.107545
Überprüfung der Schwingungsdämpfer,0.067791,1.0,0.536279,0.220822,0.143585,0.591945,-0.103104,-0.096927,-0.102214,-0.088568,-0.005696
Überprüfung der Kühlmittelsysteme,0.160749,0.536279,1.0,0.058088,0.47766,0.518238,-0.054177,-0.041462,-0.132903,-0.010588,-0.007899
Blockierung der Förderschnecke,0.179933,0.220822,0.058088,1.0,0.126646,0.076466,-0.021669,-0.005974,-0.00759,0.028118,0.035968
Überhitzung durch mangelnde Kühlmittelzirkulation,0.210402,0.143585,0.47766,0.126646,1.0,0.148309,-0.052826,-0.079253,-0.090977,0.01693,-0.007476
Überprüfung der Hydraulik,0.298898,0.591945,0.518238,0.076466,0.148309,1.0,-0.066718,-0.098302,-0.10819,-0.046841,0.014736
Ich gehe spazieren,0.114172,-0.103104,-0.054177,-0.021669,-0.052826,-0.066718,1.0,0.313149,0.522301,0.511742,0.57306
Heute um zwölf war ich unterwegs,0.028209,-0.096927,-0.041462,-0.005974,-0.079253,-0.098302,0.313149,1.0,0.265599,0.23865,0.338099
Ich gehe mit dem Hund raus,0.024582,-0.102214,-0.132903,-0.00759,-0.090977,-0.10819,0.522301,0.265599,1.0,0.31389,0.369566
Ich laufe im Park,0.013934,-0.088568,-0.010588,0.028118,0.01693,-0.046841,0.511742,0.23865,0.31389,1.0,0.543645


In [93]:
from pathlib import Path

In [94]:
data_pth = Path(r'A:\Arbeitsaufgaben\lang-data\in\02_202307\Export4.csv')

In [95]:
assert data_pth.exists()

In [96]:
from lang_main.pipelines import predefined
from lang_main.analysis import preprocessing as preproc

In [97]:
pipe_target_feat = predefined.build_base_target_feature_pipe()

In [98]:
ret = pipe_target_feat.run(starting_values=(data_pth,))

2025-01-15 15:29:13 +0000 | lang_main:base:INFO | Starting pipeline >>Target_Feature<<...
INFO:lang_main.pipelines:Starting pipeline >>Target_Feature<<...
2025-01-15 15:29:14 +0000 | lang_main:preprocessing:INFO | Loaded dataset successfully.
INFO:lang_main.analysis.preprocessing:Loaded dataset successfully.
2025-01-15 15:29:14 +0000 | lang_main:preprocessing:INFO | Dataset properties: number of entries: 129020, number of features 20
INFO:lang_main.analysis.preprocessing:Dataset properties: number of entries: 129020, number of features 20
2025-01-15 15:29:14 +0000 | lang_main:preprocessing:INFO | Number of duplicates over all features: 84
INFO:lang_main.analysis.preprocessing:Number of duplicates over all features: 84
2025-01-15 15:29:14 +0000 | lang_main:preprocessing:INFO | Number of duplicates over subset >>['VorgangsID', 'ObjektID']<<: 725
INFO:lang_main.analysis.preprocessing:Number of duplicates over subset >>['VorgangsID', 'ObjektID']<<: 725
2025-01-15 15:29:14 +0000 | lang_main

In [99]:
df = ret[0]

In [100]:
df

Unnamed: 0,batched_idxs,entry,len,num_occur,assoc_obj_ids,num_assoc_obj_ids
162,"[232, 241, 242, 244, 247, 249, 268, 269, 289, ...",Tägliche Wartungstätigkeiten nach Vorgabe des ...,66,92592,"[0, 17, 41, 42, 43, 44, 45, 46, 47, 51, 52, 53...",206
33,"[37, 50, 57, 61, 129, 245, 246, 266, 353, 378,...",Wöchentliche Sichtkontrolle / Reinigung,39,1654,"[301, 304, 305, 313, 314, 331, 332, 510, 511, ...",18
131,"[179, 196, 216, 350, 355, 408, 426, 427, 428, ...",Tägliche Überprüfung der Ölabscheider,37,1616,"[0, 970, 2134, 2137]",4
160,"[224, 276, 277, 278, 279, 280, 281, 282, 283, ...",Wöchentliche Kontrolle der WC-Anlagen,37,1265,"[1352, 1353, 1354, 1684, 1685, 1686, 1687, 168...",11
140,"[191, 192, 194, 243, 248, 254, 296, 300, 302, ...",Halbjährliche Kontrolle des Stabbreithalters,44,687,"[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 6...",166
...,...,...,...,...,...,...
3721,[48008],AS 64,5,1,[1139],1
5085,[79568],Triax,5,1,[250],1
6280,[116518],fehlt,5,1,[1662],1
3733,[48167],95,2,1,[1139],1


In [101]:
cropped = df.iloc[:1000]

In [102]:
entries = tuple(cropped['entry'])

In [103]:
# adaption to new model
query = 'query: Retrieve semantically similar text: '
new_entries = []
for doc in entries:
    new_doc = 'query: ' + doc
    new_entries.append(new_doc)
new_entries[0]

'query: Tägliche Wartungstätigkeiten nach Vorgabe des Maschinenherstellers'

In [121]:
dimensions = 1024
model = SentenceTransformer(STFRModelTypes.E5_BASE_STS_EN_DE)

In [119]:
model = model_quant

In [122]:
%%timeit
#model = load_models(model_name_new)
embds = model.encode(entries, convert_to_numpy=False, convert_to_tensor=True)
sims = model.similarity(embds, embds).numpy()
sims.shape

52.8 s ± 1.09 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [120]:
%%timeit
#model = load_models(model_name_new)
embds = model.encode(entries, convert_to_numpy=False, convert_to_tensor=True)
sims = model.similarity(embds, embds).numpy()
sims.shape

44.3 s ± 490 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Test base model and alternative

In [157]:
base_model_res = preproc.merge_similarity_duplicates(cropped, model_ref, cos_sim_threshold=0.8)

2025-01-09 12:33:23 +0000 | lang_main:preprocessing:INFO | Start merging of similarity candidates...
2025-01-09 12:34:04 +0000 | lang_main:preprocessing:INFO | Similarity candidates merged successfully.


In [159]:
base_model_res = base_model_res[0]

In [160]:
len(base_model_res)

707

In [2]:
THRESHOLD = 0.88

In [164]:
base_model_alt = preproc.merge_similarity_duplicates(cropped, model_alt, cos_sim_threshold=THRESHOLD)

2025-01-09 12:40:55 +0000 | lang_main:preprocessing:INFO | Start merging of similarity candidates...
2025-01-09 12:43:22 +0000 | lang_main:preprocessing:INFO | Similarity candidates merged successfully.


In [165]:
base_model_alt = base_model_alt[0]

In [166]:
len(base_model_alt)

425

- paraphrase-multilingual-mpnet-base-v2
- paraphrase-multilingual-MiniLM-L12-v2

In [41]:
from sentence_transformers.util import cos_sim

In [42]:
import torch
from transformers import AutoModel
from numpy.linalg import norm



In [48]:
model = load_models(STFRModelTypes.JINAAI_BASE_DE_V2, trust_remote=True, use_onnx=True)
if model.max_seq_length > 1024:
    model.max_seq_length = 1024

In [49]:
embeddings = model.encode(entries)

In [51]:
sims, duration = benchmark_sims(model, entries)

In [52]:
duration

54.76219579999997

In [90]:
import numpy as np

In [112]:
THRESHOLD = 0.85
sim = sims.copy()

In [113]:
np.fill_diagonal(sim, 0)
sim = np.triu(sim)
#arr = np.where(sim > THRESHOLD, sim, 0)

In [114]:
np.count_nonzero(sim)

499500

In [115]:
idx = np.argwhere(sim >= THRESHOLD)

In [116]:
idx[0]

array([ 1, 15])

In [117]:
count = 0
for entry in idx:
    if count == 30:
        break
    txt1 = entries[entry[0]]
    txt2 = entries[entry[1]]
    value = sim[entry[0],entry[1]]
    print(f'Text pair with >>{value}<<:\n{txt1}\n---\n{txt2}\n\n')

    count += 1

Text pair with >>0.9210827350616455<<:
Wöchentliche Sichtkontrolle / Reinigung
---
Wöchentliche Sichtprüfung / Reinigung


Text pair with >>0.8897931575775146<<:
Wöchentliche Sichtkontrolle / Reinigung
---
Monatliche Sichtkontrolle / Reinigung


Text pair with >>0.8792937397956848<<:
Prüfung von: - Scharniere - Dichtung - Schließvorrichtung - Schloß - Beschlag - allgemeine Funktion - Schmierung - Festhaltevorrichtung
---
Monatliche Prüfung von: - Scharniere - Dichtung - Schließvorrichtung - Schloß - Beschlag - allgemeine Funktion - Schmierung - Festhaltevorrichtung


Text pair with >>0.9156877994537354<<:
Prüfung von: - Scharniere - Dichtung - Schließvorrichtung - Schloß - Beschlag - allgemeine Funktion - Schmierung - Festhaltevorrichtung
---
Prüfung von: Hr. Förster - Scharniere - Dichtung - Schließvorrichtung - Schloß - Beschlag - allgemeine Funktion - Schmierung - Festhaltevorrichtung


Text pair with >>0.9321640133857727<<:
Prüfung von: - Scharniere - Dichtung - Schließvorrichtung 

In [None]:
docs = [
    'Tägliche Wartungstätigkeiten nach Vorgabe des Maschinenherstellers',
    'Monatliche Prüfung von: - Scharniere - Dichtung - Schließvorrichtung - Schloß - Beschlag - allgemeine Funktion - Schmierung - Festhaltevorrichtung',
]

In [293]:
embds = model_alt.encode(docs, normalize_embeddings=True)
sims_t = model_alt.similarity(embds, embds).numpy()
sims_t.shape

(2, 2)

In [294]:
sims_t

array([[0.99999976, 0.7587665 ],
       [0.75876653, 1.0000002 ]], dtype=float32)