STRF for similarity duplicates, time analysis pipeline, enhanced config

This commit is contained in:
Florian Förster
2024-05-29 16:34:31 +02:00
parent 5d2c97165a
commit bb987e2108
30 changed files with 1875 additions and 693 deletions

View File

@@ -1,28 +1,42 @@
from typing import cast
from pathlib import Path
import pandas as pd
import plotly.express as px
from dash import (
Dash,
html,
dcc,
callback,
Output,
Input,
Output,
State,
callback,
dash_table,
dcc,
html,
)
import plotly.express as px
import pandas as pd
from lang_main import load_pickle
from lang_main.types import ObjectID, TimelineCandidates
from pandas import DataFrame
from lang_main import load_pickle
from lang_main.types import TimelineCandidates, ObjectID
#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
# ** data
data = cast(DataFrame, load_pickle('./data.pkl'))
cands = cast(TimelineCandidates, load_pickle('./map_candidates.pkl'))
texts = cast(dict[ObjectID, str], load_pickle('./map_texts.pkl'))
# Pickled pipeline results consumed by the dashboard, relative to the current
# working directory. The paths are assembled from separate components so they
# resolve on POSIX as well as on Windows; a raw string with backslashes is only
# split into directories on Windows.
p_df = Path('test-notebooks', 'dashboard', 'Pipe-TargetFeature_Step-3_remove_NA.pkl')
p_tl = Path(
    'test-notebooks',
    'dashboard',
    'Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl',
)
# The loaded object is indexed with [0] to obtain the DataFrame, so it is a
# container wrapping the frame, not the frame itself; the cast reflects that.
# NOTE(review): presumably a tuple of step results — confirm against the pipeline.
data = cast(tuple[DataFrame, ...], load_pickle(p_df))[0]
# Second pickle: (timeline candidates, mapping ObjectID -> text), as stated by the cast.
# NOTE(review): pickle must only be loaded from trusted, locally produced files.
ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
cands = ret[0]
texts = ret[1]
# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
# data = cast(DataFrame, load_pickle(p_df))
# cands = cast(TimelineCandidates, load_pickle(p_cands))
# texts = cast(dict[ObjectID, str], load_pickle(p_map))
table_feats = [
'ErstellungsDatum',
'ErledigungsDatum',
@@ -52,25 +66,28 @@ hover_data = {
app = Dash(prevent_initial_callbacks=True)
app.layout = [
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign':'center'}),
html.Div(children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder="ObjektID auswählen...",
)
]),
html.Div(children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]),
html.Div(children=[
dash_table.DataTable(id='table-candidates')
]),
html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
html.Div(
children=[
html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
dcc.Dropdown(
list(cands.keys()),
id='dropdown-selection',
placeholder='ObjektID auswählen...',
),
]
),
html.Div(
children=[
html.H3(id='object_text'),
dcc.Dropdown(id='choice-candidates'),
dcc.Graph(id='graph-output'),
]
),
html.Div(children=[dash_table.DataTable(id='table-candidates')]),
]
@callback(
Output('object_text', 'children'),
Input('dropdown-selection', 'value'),
@@ -82,6 +99,7 @@ def update_obj_text(obj_id):
headline = f'HObjektText: {obj_text}'
return headline
@callback(
Output('choice-candidates', 'options'),
Input('dropdown-selection', 'value'),
@@ -90,9 +108,10 @@ def update_obj_text(obj_id):
def update_choice_candidates(obj_id):
obj_id = int(obj_id)
cands_obj_id = cands[obj_id]
choices = list(range(1, len(cands_obj_id)+1))
choices = list(range(1, len(cands_obj_id) + 1))
return choices
@callback(
Output('graph-output', 'figure'),
Input('choice-candidates', 'value'),
@@ -106,7 +125,7 @@ def update_timeline(index, obj_id):
title = f'HObjektText: {obj_text}'
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index()
# figure
@@ -117,22 +136,18 @@ def update_timeline(index, obj_id):
title=title,
hover_data=hover_data,
)
fig.update_traces(
mode='markers+lines',
marker=markers,
marker_symbol='diamond'
)
fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
fig.update_xaxes(
tickformat="%B\n%Y",
tickformat='%B\n%Y',
rangeslider_visible=True,
)
fig.update_yaxes(type='category')
fig.update_layout(hovermode="x unified")
fig.update_layout(hovermode='x unified')
return fig
@callback(
[Output('table-candidates', 'data'),
Output('table-candidates', 'columns')],
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
Input('choice-candidates', 'value'),
State('dropdown-selection', 'value'),
prevent_initial_call=True,
@@ -141,19 +156,20 @@ def update_table_candidates(index, obj_id):
obj_id = int(obj_id)
# cands
cands_obj_id = cands[obj_id]
cands_choice = cands_obj_id[int(index)-1]
cands_choice = cands_obj_id[int(index) - 1]
# data
df = data.loc[list(cands_choice)].sort_index()
df = (df
.filter(items=table_feats, axis=1)
.sort_values(by='ErstellungsDatum', ascending=True))
cols = [{"name": i, "id": i} for i in df.columns]
df = df.filter(items=table_feats, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in table_feats_dates:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
if __name__ == '__main__':
app.run(debug=True)
app.run(debug=True)

View File

@@ -0,0 +1,56 @@
# lang_main: Config file
# All keys below are read by code that is not part of this file; key names and
# table names must therefore stay exactly as written.

# Locations of the input directory, the results directory and the dataset (CSV).
# NOTE(review): paths are relative — presumably to the working directory; confirm.
[paths]
inputs = './inputs/'
results = './results/test_new2/'
dataset = './01_2_Rohdaten_neu/Export4.csv'
# Alternative results/dataset pairs (full and truncated Export7), currently disabled.
#results = './results/Export7/'
#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
#results = './results/Export7_trunc/'
#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

# One flag pair per stage: `<stage>` and `<stage>_skip`.
# Only `preprocessing` is enabled here; all other flags are false.
# NOTE(review): how `<stage>` and `<stage>_skip` interact is defined by the
# consuming code — not visible from this file.
[control]
preprocessing = true
preprocessing_skip = false
token_analysis = false
token_analysis_skip = false
graph_postprocessing = false
graph_postprocessing_skip = false
time_analysis = false
time_analysis_skip = false

# Disabled table; the same key is currently defined under [preprocess] instead.
#[export_filenames]
#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'

[preprocess]
filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
# Names of the date columns in the dataset.
date_cols = [
"VorgangsDatum",
"ErledigungsDatum",
"Arbeitsbeginn",
"ErstellungsDatum",
]
threshold_amount_characters = 5
# Separate key from `threshold_similarity` in [time_analysis.model_input];
# both are currently set to 0.8 but can be changed independently.
threshold_similarity = 0.8

[graph_postprocessing]
threshold_edge_weight = 150

[time_analysis.uniqueness]
threshold_unique_texts = 4
criterion_feature = 'HObjektText'
feature_name_obj_id = 'ObjektID'

[time_analysis.model_input]
# Column names used as input features.
input_features = [
'VorgangsTypName',
'VorgangsArtText',
'VorgangsBeschreibung',
]
# Column holding the activity type, and the activity type values listed for it.
activity_feature = 'VorgangsTypName'
activity_types = [
'Reparaturauftrag (Portal)',
'Störungsmeldung',
]
# NOTE(review): the key name is misspelled ("acitivities" instead of "activities").
# Renaming it here alone would break whatever code looks up this exact key;
# fix both sides together.
threshold_num_acitivities = 1
threshold_similarity = 0.8