STRF for similarity duplicates, time analysis pipeline, enhanced config

2024-05-29 16:34:31 +02:00
parent 5d2c97165a
commit bb987e2108
30 changed files with 1875 additions and 693 deletions
--- a/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl
+++ b/test-notebooks/dashboard/Pipe-TargetFeature_Step-3_remove_NA.pkl
--- a/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl
+++ b/test-notebooks/dashboard/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl
--- a/test-notebooks/dashboard/app.py
+++ b/test-notebooks/dashboard/app.py
@@ -1,28 +1,42 @@
 from typing import cast
+from pathlib import Path

+import pandas as pd
+import plotly.express as px
 from dash import (
    Dash,
-    html,
-    dcc,
-    callback,
-    Output,
    Input,
+    Output,
    State,
+    callback,
    dash_table,
+    dcc,
+    html,
 )
-import plotly.express as px
-import pandas as pd
+from lang_main import load_pickle
+from lang_main.types import ObjectID, TimelineCandidates
 from pandas import DataFrame

-from lang_main import load_pickle
-from lang_main.types import TimelineCandidates, ObjectID
-
-#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
+# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')

 # ** data
-data = cast(DataFrame, load_pickle('./data.pkl'))
-cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
-texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
+p_df = Path(r'.\test-notebooks\dashboard\Pipe-TargetFeature_Step-3_remove_NA.pkl')  # NOTE(review): relative to the working directory; backslashes act as path separators only on Windows
+p_tl = Path(
+    r'.\test-notebooks\dashboard\Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl'
+)
+ret = cast(DataFrame, load_pickle(p_df))  # NOTE(review): indexed with [0] on the next line, so the pickle presumably holds a container wrapping the frame; the cast target looks too narrow - confirm
+data = ret[0]  # first element of the loaded result; used as the data table by the callbacks
+ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))  # `ret` is rebound here with a different type
+cands = ret[0]  # timeline candidates (first tuple element)
+texts = ret[1]  # second tuple element, typed as dict[ObjectID, str]
+
+# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
+# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
+# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
+# data = cast(DataFrame, load_pickle(p_df))
+# cands = cast(TimelineCandidates, load_pickle(p_cands))
+# texts = cast(dict[ObjectID, str], load_pickle(p_map))
+
 table_feats = [
    'ErstellungsDatum',
    'ErledigungsDatum',
@@ -52,25 +66,28 @@ hover_data = {
 app = Dash(prevent_initial_callbacks=True)

 app.layout = [
-    html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
-    html.Div(children=[
-        html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
-        dcc.Dropdown(
-            list(cands.keys()),
-            id='dropdown-selection',
-            placeholder="ObjektID auswählen...",
-        )
-    ]),
-    html.Div(children=[
-        html.H3(id='object_text'),
-        dcc.Dropdown(id='choice-candidates'),
-        dcc.Graph(id='graph-output'),
-    ]),
-    html.Div(children=[
-        dash_table.DataTable(id='table-candidates')
-    ]),
+    html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
+    html.Div(
+        children=[
+            html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
+            dcc.Dropdown(
+                list(cands.keys()),
+                id='dropdown-selection',
+                placeholder='ObjektID auswählen...',
+            ),
+        ]
+    ),
+    html.Div(
+        children=[
+            html.H3(id='object_text'),
+            dcc.Dropdown(id='choice-candidates'),
+            dcc.Graph(id='graph-output'),
+        ]
+    ),
+    html.Div(children=[dash_table.DataTable(id='table-candidates')]),
 ]

+
@callback(
    Output('object_text', 'children'),
    Input('dropdown-selection', 'value'),
@@ -82,6 +99,7 @@ def update_obj_text(obj_id):
    headline = f'HObjektText: {obj_text}'
    return headline

+
@callback(
    Output('choice-candidates', 'options'),
    Input('dropdown-selection', 'value'),
@@ -90,9 +108,10 @@ def update_obj_text(obj_id):
 def update_choice_candidates(obj_id):
    obj_id = int(obj_id)
    cands_obj_id = cands[obj_id]
-    choices = list(range(1, len(cands_obj_id)+1))
+    choices = list(range(1, len(cands_obj_id) + 1))
    return choices

+
@callback(
    Output('graph-output', 'figure'),
    Input('choice-candidates', 'value'),
@@ -106,7 +125,7 @@ def update_timeline(index, obj_id):
    title = f'HObjektText: {obj_text}'
    # cands
    cands_obj_id = cands[obj_id]
-    cands_choice = cands_obj_id[int(index)-1]
+    cands_choice = cands_obj_id[int(index) - 1]
    # data
    df = data.loc[list(cands_choice)].sort_index()
    # figure
@@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
        title=title,
        hover_data=hover_data,
    )
-    fig.update_traces(
-        mode='markers+lines',
-        marker=markers,
-        marker_symbol='diamond'
-    )
+    fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
    fig.update_xaxes(
-        tickformat="%B\n%Y",
+        tickformat='%B\n%Y',
        rangeslider_visible=True,
    )
    fig.update_yaxes(type='category')
-    fig.update_layout(hovermode="x unified")
+    fig.update_layout(hovermode='x unified')
    return fig

+
@callback(
-    [Output('table-candidates', 'data'),
-     Output('table-candidates', 'columns')],
+    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
    Input('choice-candidates', 'value'),
    State('dropdown-selection', 'value'),
    prevent_initial_call=True,
@@ -141,19 +156,20 @@ def update_table_candidates(index, obj_id):
    obj_id = int(obj_id)
    # cands
    cands_obj_id = cands[obj_id]
-    cands_choice = cands_obj_id[int(index)-1]
+    cands_choice = cands_obj_id[int(index) - 1]
    # data
    df = data.loc[list(cands_choice)].sort_index()
-    df = (df
-          .filter(items=table_feats, axis=1)
-          .sort_values(by='ErstellungsDatum', ascending=True))
-    cols = [{"name": i, "id": i} for i in df.columns]
+    df = df.filter(items=table_feats, axis=1).sort_values(
+        by='ErstellungsDatum', ascending=True
+    )
+    cols = [{'name': i, 'id': i} for i in df.columns]
    # convert dates to strings
    for col in table_feats_dates:
        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
-    
+
    table_data = df.to_dict('records')
    return table_data, cols

+
 if __name__ == '__main__':
-    app.run(debug=True)
+    app.run(debug=True)
--- a/test-notebooks/dashboard/archive/data.pkl
+++ b/test-notebooks/dashboard/archive/data.pkl
--- a/test-notebooks/dashboard/archive/map_candidates.pkl
+++ b/test-notebooks/dashboard/archive/map_candidates.pkl
--- a/test-notebooks/dashboard/archive/map_texts.pkl
+++ b/test-notebooks/dashboard/archive/map_texts.pkl
--- a/test-notebooks/dashboard/lang_main_config.toml
+++ b/test-notebooks/dashboard/lang_main_config.toml
@@ -0,0 +1,56 @@
+# lang_main: Config file
+
+[paths]
+inputs = './inputs/'
+results = './results/test_new2/'
+dataset = './01_2_Rohdaten_neu/Export4.csv'
+#results = './results/Export7/'
+#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
+#results = './results/Export7_trunc/'
+#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
+
+[control]
+preprocessing = true
+preprocessing_skip = false
+token_analysis = false
+token_analysis_skip = false
+graph_postprocessing = false
+graph_postprocessing_skip = false
+time_analysis = false
+time_analysis_skip = false
+
+#[export_filenames]
+#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
+
+[preprocess]
+filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
+date_cols = [
+    "VorgangsDatum", 
+    "ErledigungsDatum", 
+    "Arbeitsbeginn", 
+    "ErstellungsDatum",
+]
+threshold_amount_characters = 5
+threshold_similarity = 0.8
+
+[graph_postprocessing]
+threshold_edge_weight = 150
+
+[time_analysis.uniqueness]
+threshold_unique_texts = 4
+criterion_feature = 'HObjektText'
+feature_name_obj_id = 'ObjektID'
+
+[time_analysis.model_input]
+input_features = [
+    'VorgangsTypName',
+    'VorgangsArtText',
+    'VorgangsBeschreibung',
+]
+activity_feature = 'VorgangsTypName'
+activity_types = [
+    'Reparaturauftrag (Portal)',
+    'Störungsmeldung',
+]
+threshold_num_acitivities = 1  # NOTE(review): key is spelled "acitivities" (sic); do not rename without updating whatever reads this key - confirm against the config loader
+threshold_similarity = 0.8  # same key name and value as in [preprocess]; this one belongs to the time_analysis.model_input table
--- a/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl
+++ b/test-notebooks/dashboard/new/Pipe-TargetFeature_Step-3_remove_NA.pkl
--- a/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl
+++ b/test-notebooks/dashboard/new/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl