new pipeline management, proto graph display timeline

commit fb4437a3a2 (parent c2714b8060)

pdm.lock (generated, 8 lines changed)
@@ -5,7 +5,7 @@
 groups = ["default", "notebooks", "trials"]
 strategy = ["cross_platform", "inherit_metadata"]
 lock_version = "4.4.1"
-content_hash = "sha256:8781981bde2786c60273cd73599f4ab6a388d0b435484d5ba0afa0656723dd98"
+content_hash = "sha256:e00f157f833ee7615d96375c352e2caa6b4f6b50e5615ccbefa79446189594c7"

 [[package]]
 name = "annotated-types"
@@ -2938,13 +2938,13 @@ files = [

 [[package]]
 name = "typing-extensions"
-version = "4.11.0"
+version = "4.12.2"
 requires_python = ">=3.8"
 summary = "Backported and Experimental Type Hints for Python 3.8+"
 groups = ["default", "notebooks", "trials"]
 files = [
-    {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"},
-    {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"},
+    {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"},
+    {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"},
 ]

 [[package]]
pyproject.toml

@@ -12,6 +12,7 @@ dependencies = [
     "sentence-transformers>=2.7.0",
     "numpy>=1.26.4",
     "pip>=24.0",
+    "typing-extensions>=4.12.2",
 ]
 requires-python = ">=3.11"
 readme = "README.md"
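Note: `typing-extensions` becomes a direct runtime dependency here instead of a transitive one, with the floor pinned to the version resolved in pdm.lock above. A minimal sanity check that the pin is usable (the imported name is a real typing_extensions export; the class is illustrative):

    # Illustrative use of the now-direct dependency.
    from typing_extensions import Self


    class StepChain:
        def add(self, name: str) -> Self:  # Self keeps subclass types intact in chains
            return self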
@@ -48,3 +49,6 @@ skip-magic-trailing-comma = false

 [tool.ruff.lint]
 select = ["E", "F", "I"]
+
+[tool.ruff.lint.isort]
+extra-standard-library = ["typing_extensions"]
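Registering `typing_extensions` as extra-standard-library makes Ruff's isort rule (the "I" entry in the select list above) sort it with the `typing` module instead of with third-party packages. With default grouping the enforced order looks like this sketch:

    # Standard-library group: typing_extensions now sorts here
    import typing
    from typing_extensions import TypeAlias

    # Third-party group stays separate
    import pandas as pd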
@@ -1,42 +1,44 @@
 import typing
-import warnings
-from pathlib import Path
 from typing import cast

+from pandas import DataFrame, Series
+
 from lang_main.analysis.graphs import TokenGraph
 from lang_main.constants import (
-    DO_GRAPH_POSTPROCESSING,
-    DO_PREPROCESSING,
-    DO_TIME_ANALYSIS,
-    DO_TOKEN_ANALYSIS,
-    INPUT_PATH_FOLDER,
     PATH_TO_DATASET,
     SAVE_PATH_FOLDER,
     SKIP_GRAPH_POSTPROCESSING,
     SKIP_PREPROCESSING,
     SKIP_TIME_ANALYSIS,
     SKIP_TOKEN_ANALYSIS,
-    THRESHOLD_AMOUNT_CHARACTERS,
-    THRESHOLD_EDGE_WEIGHT,
 )
-from lang_main.io import create_saving_folder, load_pickle
+from lang_main.io import create_saving_folder, get_entry_point, load_pickle
+from lang_main.pipelines.base import PipelineContainer
 from lang_main.pipelines.predefined import (
-    pipe_merge,
-    pipe_target_feat,
-    pipe_timeline,
-    pipe_token_analysis,
+    build_base_target_feature_pipe,
+    build_merge_duplicates_pipe,
+    build_timeline_pipe,
+    build_tk_graph_pipe,
+    build_tk_graph_post_pipe,
 )
 from lang_main.types import (
+    EntryPoints,
     ObjectID,
     PandasIndex,
     SpacyDoc,
     TimelineCandidates,
 )
-from pandas import DataFrame, Series

+# ** build pipelines
+pipe_merge = build_merge_duplicates_pipe()
+pipe_target_feat = build_base_target_feature_pipe()
+pipe_timeline = build_timeline_pipe()
+pipe_token_analysis = build_tk_graph_pipe()
+pipe_graph_postprocessing = build_tk_graph_post_pipe()


-# ** processing pipeline
-def run_preprocessing() -> DataFrame:
+# ** preprocessing pipeline
+def run_preprocessing() -> None:
     create_saving_folder(
         saving_path_folder=SAVE_PATH_FOLDER,
         overwrite_existing=False,
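The imports switch from pre-built pipeline singletons to build_* factories that are called once at module level. The factory implementations live in lang_main.pipelines.predefined and are not part of this diff; the following self-contained sketch only illustrates the pattern, and every name in it is an assumption rather than the library's API:

    from typing import Any, Callable

    Step = Callable[[tuple[Any, ...]], tuple[Any, ...]]


    class Pipeline:
        """Hypothetical stand-in for the real pipeline type."""

        def __init__(self, name: str) -> None:
            self.name = name
            self._steps: list[Step] = []

        def add_step(self, step: Step) -> None:
            self._steps.append(step)

        def run(self, starting_values: tuple[Any, ...]) -> tuple[Any, ...]:
            values = starting_values
            for step in self._steps:
                values = step(values)  # each step consumes and returns a tuple
            return values


    def build_merge_duplicates_pipe() -> Pipeline:
        # a fresh instance per call avoids shared mutable state between runs,
        # the usual motivation for replacing module-level singletons
        return Pipeline(name='Pipe-Merge_Duplicates')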
@@ -46,134 +48,69 @@ def run_preprocessing() -> DataFrame:
         tuple[DataFrame], pipe_target_feat.run(starting_values=(PATH_TO_DATASET,))
     )
     target_feat_data = ret[0]
-    # only entries with more than threshold amount of characters
-    data_filter = typing.cast(Series, (target_feat_data['len'] > THRESHOLD_AMOUNT_CHARACTERS))
-    subset_data = target_feat_data.loc[data_filter].copy()
-    # merge duplicates, results saved separately
-    ret = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(subset_data,)))
-    preprocessed_data = ret[0]
-
-    return preprocessed_data
+    _ = typing.cast(tuple[DataFrame], pipe_merge.run(starting_values=(target_feat_data,)))


-def run_token_analysis(
-    preprocessed_data: DataFrame,
-) -> tuple[TokenGraph, dict[PandasIndex, SpacyDoc]]:
+# ** token analysis
+def run_token_analysis() -> None:
+    # load entry point
+    entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TOKEN_ANALYSIS)
+    loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
+    preprocessed_data = loaded_results[0]
     # build token graph
     (tk_graph, docs_mapping) = typing.cast(
-        tuple[TokenGraph, dict[PandasIndex, SpacyDoc]],
+        tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
         pipe_token_analysis.run(starting_values=(preprocessed_data,)),
     )
-    tk_graph.save_graph(SAVE_PATH_FOLDER, directed=False)
-    tk_graph.to_pickle(SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph')
-
-    return tk_graph, docs_mapping
+    tk_graph.to_GraphML(SAVE_PATH_FOLDER, filename='TokenGraph', directed=False)


-def run_graph_postprocessing(
-    tk_graph: TokenGraph,
-) -> TokenGraph:
+def run_graph_postprocessing() -> None:
+    # load entry point
+    entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TK_GRAPH_POST)
+    loaded_results = cast(
+        tuple[TokenGraph, dict[PandasIndex, SpacyDoc] | None],
+        load_pickle(entry_point_path),
+    )
+    tk_graph = loaded_results[0]
     # filter graph by edge weight and remove single nodes (no connection)
-    tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT)
-    tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1)
-    tk_graph_filtered.save_graph(
+    ret = cast(tuple[TokenGraph], pipe_graph_postprocessing.run(starting_values=(tk_graph,)))
+    tk_graph_filtered = ret[0]
+    # tk_graph_filtered = tk_graph.filter_by_edge_weight(THRESHOLD_EDGE_WEIGHT, None)
+    # tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
+    tk_graph_filtered.to_GraphML(
         SAVE_PATH_FOLDER, filename='TokenGraph-filtered', directed=False
     )
-    tk_graph_filtered.to_pickle(
-        SAVE_PATH_FOLDER, filename=f'{pipe_token_analysis.name}-TokenGraph-filtered'
-    )
-
-    return tk_graph_filtered


-def run_time_analysis() -> tuple[TimelineCandidates, dict[ObjectID, str]]:
-    filename = 'without_nan'
-    loading_path = INPUT_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
-    verify_path(loading_path)
-    ret = load_pickle(loading_path)
-    preprocessed_data = ret[0]
+# ** time analysis
+def run_time_analysis() -> None:
+    # load entry point
+    entry_point_path = get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
+    loaded_results = cast(tuple[DataFrame], load_pickle(entry_point_path))
+    preprocessed_data = loaded_results[0]

-    ret = cast(
+    _ = cast(
         tuple[TimelineCandidates, dict[ObjectID, str]],
         pipe_timeline.run(starting_values=(preprocessed_data,)),
     )
-    return ret


-def verify_path(
-    loading_path: Path,
-) -> None:
-    if not loading_path.exists():
-        raise FileNotFoundError(f'Could not load results. File not found: {loading_path}')
+def build_pipeline_container() -> PipelineContainer:
+    container = PipelineContainer(
+        name='Pipeline-Container-Base', working_dir=SAVE_PATH_FOLDER
+    )
+    container.add(run_preprocessing, skip=SKIP_PREPROCESSING)
+    container.add(run_token_analysis, skip=SKIP_TOKEN_ANALYSIS)
+    container.add(run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
+    container.add(run_time_analysis, skip=SKIP_TIME_ANALYSIS)
+
+    return container


 def main() -> None:
-    pre_step_skipped: bool = False
-    # ** preprocess
-    if DO_PREPROCESSING and not SKIP_PREPROCESSING:
-        preprocessed_data = run_preprocessing()
-    elif not SKIP_PREPROCESSING:
-        # !! hardcoded result filenames
-        target_pattern: str = r'*Pipe-Merge_Duplicates_Step-1*'
-        loading_path = list(SAVE_PATH_FOLDER.glob(target_pattern))[0]
-        verify_path(loading_path)
-        ret = typing.cast(tuple[DataFrame], load_pickle(loading_path))
-        preprocessed_data = ret[0]
-    else:
-        pre_step_skipped = True
-        warnings.warn('No preprocessing action selected. Skipped.')
-        # sys.exit(0)
-    # ** token analysis
-    if DO_TOKEN_ANALYSIS and not SKIP_TOKEN_ANALYSIS:
-        if pre_step_skipped:
-            raise RuntimeError(
-                'Preprocessing step skipped. Token analysis cannot be performed.'
-            )
-        preprocessed_data_trunc = typing.cast(
-            DataFrame, preprocessed_data[['batched_idxs', 'entry', 'num_occur']].copy()
-        )  # type: ignore
-        tk_graph, docs_mapping = run_token_analysis(preprocessed_data_trunc)
-    elif not SKIP_TOKEN_ANALYSIS:
-        # !! hardcoded result filenames
-        # whole graph
-        filename: str = f'{pipe_token_analysis.name}-TokenGraph'
-        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
-        verify_path(loading_path)
-        # tk_graph = typing.cast(TokenGraph, load_pickle(loading_path))
-        tk_graph = TokenGraph.from_pickle(loading_path)
-        pre_step_skipped = False
-    else:
-        pre_step_skipped = True
-        warnings.warn('No token analysis action selected. Skipped.')
-    # ** graph postprocessing
-    if DO_GRAPH_POSTPROCESSING and not SKIP_GRAPH_POSTPROCESSING:
-        if pre_step_skipped:
-            raise RuntimeError(
-                (
-                    'Preprocessing or token analysis step skipped. '
-                    'Graph postprocessing cannot be performed.'
-                )
-            )
-        tk_graph_filtered = run_graph_postprocessing(tk_graph)
-    elif not SKIP_GRAPH_POSTPROCESSING:
-        # !! hardcoded result filenames
-        # filtered graph
-        filename: str = f'{pipe_token_analysis.name}-TokenGraph-filtered'
-        loading_path = SAVE_PATH_FOLDER.joinpath(filename).with_suffix('.pkl')
-        verify_path(loading_path)
-        # tk_graph_filtered = typing.cast(TokenGraph, load_pickle(loading_path))
-        tk_graph_filtered = TokenGraph.from_pickle(loading_path)
-        pre_step_skipped = False
-    else:
-        warnings.warn('No graph postprocessing action selected. Skipped.')
-    # ** time analysis
-    if DO_TIME_ANALYSIS and not SKIP_TIME_ANALYSIS:
-        # no check for fails, runs separately
-        ret = run_time_analysis()
-    elif not SKIP_TIME_ANALYSIS:
-        ...
-    else:
-        warnings.warn('No time analysis action selected. Skipped.')
+    procedure = build_pipeline_container()
+    procedure.run()


 if __name__ == '__main__':
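main() now delegates ordering and skip handling to a PipelineContainer, and each stage reloads its input from a persisted entry point (get_entry_point plus load_pickle) instead of receiving it as an argument. Only add(callable, skip=...) and run() are visible in the diff; this compatible sketch is an assumption, not the library's code:

    from dataclasses import dataclass, field
    from pathlib import Path
    from typing import Callable


    @dataclass
    class PipelineContainer:
        """Hypothetical sketch: runs registered stages in order, honouring skip flags."""

        name: str
        working_dir: Path
        _stages: list[tuple[Callable[[], None], bool]] = field(default_factory=list)

        def add(self, stage: Callable[[], None], skip: bool = False) -> None:
            self._stages.append((stage, skip))

        def run(self) -> None:
            for stage, skip in self._stages:
                if skip:
                    continue  # a skipped stage leaves its saved entry point untouched
                stage()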
Binary file not shown.
Binary file not shown.
@@ -1,190 +0,0 @@
-import time
-import webbrowser
-from pathlib import Path
-from threading import Thread
-from typing import cast
-
-import pandas as pd
-import plotly.express as px
-from dash import (
-    Dash,
-    Input,
-    Output,
-    State,
-    callback,
-    dash_table,
-    dcc,
-    html,
-)
-from lang_main.io import load_pickle
-from lang_main.types import ObjectID, TimelineCandidates
-from pandas import DataFrame
-
-# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
-
-# ** data
-p_df = Path(r'./Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
-p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
-ret = cast(DataFrame, load_pickle(p_df))
-data = ret[0]
-ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
-cands = ret[0]
-texts = ret[1]
-
-# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
-# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
-# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
-# data = cast(DataFrame, load_pickle(p_df))
-# cands = cast(TimelineCandidates, load_pickle(p_cands))
-# texts = cast(dict[ObjectID, str], load_pickle(p_map))
-
-table_feats = [
-    'ErstellungsDatum',
-    'ErledigungsDatum',
-    'VorgangsTypName',
-    'VorgangsBeschreibung',
-]
-table_feats_dates = [
-    'ErstellungsDatum',
-    'ErledigungsDatum',
-]
-
-# ** graph config
-markers = {
-    'size': 12,
-    'color': 'yellow',
-    'line': {
-        'width': 2,
-        'color': 'red',
-    },
-}
-hover_data = {
-    'ErstellungsDatum': '|%d.%m.%Y',
-    'VorgangsBeschreibung': True,
-}
-
-
-app = Dash(prevent_initial_callbacks=True)
-
-app.layout = [
-    html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
-    html.Div(
-        children=[
-            html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
-            dcc.Dropdown(
-                list(cands.keys()),
-                id='dropdown-selection',
-                placeholder='ObjektID auswählen...',
-            ),
-        ]
-    ),
-    html.Div(
-        children=[
-            html.H3(id='object_text'),
-            dcc.Dropdown(id='choice-candidates'),
-            dcc.Graph(id='graph-output'),
-        ]
-    ),
-    html.Div(children=[dash_table.DataTable(id='table-candidates')]),
-]
-
-
-@callback(
-    Output('object_text', 'children'),
-    Input('dropdown-selection', 'value'),
-    prevent_initial_call=True,
-)
-def update_obj_text(obj_id):
-    obj_id = int(obj_id)
-    obj_text = texts[obj_id]
-    headline = f'HObjektText: {obj_text}'
-    return headline
-
-
-@callback(
-    Output('choice-candidates', 'options'),
-    Input('dropdown-selection', 'value'),
-    prevent_initial_call=True,
-)
-def update_choice_candidates(obj_id):
-    obj_id = int(obj_id)
-    cands_obj_id = cands[obj_id]
-    choices = list(range(1, len(cands_obj_id) + 1))
-    return choices
-
-
-@callback(
-    Output('graph-output', 'figure'),
-    Input('choice-candidates', 'value'),
-    State('dropdown-selection', 'value'),
-    prevent_initial_call=True,
-)
-def update_timeline(index, obj_id):
-    obj_id = int(obj_id)
-    # title
-    obj_text = texts[obj_id]
-    title = f'HObjektText: {obj_text}'
-    # cands
-    cands_obj_id = cands[obj_id]
-    cands_choice = cands_obj_id[int(index) - 1]
-    # data
-    df = data.loc[list(cands_choice)].sort_index()  # type: ignore
-    # figure
-    fig = px.line(
-        data_frame=df,
-        x='ErstellungsDatum',
-        y='ObjektID',
-        title=title,
-        hover_data=hover_data,
-    )
-    fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
-    fig.update_xaxes(
-        tickformat='%B\n%Y',
-        rangeslider_visible=True,
-    )
-    fig.update_yaxes(type='category')
-    fig.update_layout(hovermode='x unified')
-    return fig
-
-
-@callback(
-    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
-    Input('choice-candidates', 'value'),
-    State('dropdown-selection', 'value'),
-    prevent_initial_call=True,
-)
-def update_table_candidates(index, obj_id):
-    obj_id = int(obj_id)
-    # cands
-    cands_obj_id = cands[obj_id]
-    cands_choice = cands_obj_id[int(index) - 1]
-    # data
-    df = data.loc[list(cands_choice)].sort_index()  # type: ignore
-    df = df.filter(items=table_feats, axis=1).sort_values(
-        by='ErstellungsDatum', ascending=True
-    )
-    cols = [{'name': i, 'id': i} for i in df.columns]
-    # convert dates to strings
-    for col in table_feats_dates:
-        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
-
-    table_data = df.to_dict('records')
-    return table_data, cols
-
-
-def _start_webbrowser():
-    host = '127.0.0.1'
-    port = '8050'
-    adress = f'http://{host}:{port}/'
-    time.sleep(2)
-    webbrowser.open_new(adress)
-
-
-def main():
-    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
-    webbrowser_thread.start()
-    app.run(debug=True)
-
-
-if __name__ == '__main__':
-    main()
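The deleted dashboard (and its replacement later in this commit) chains its controls: the object dropdown fills the candidate dropdown, and the candidate choice drives figure and table, with the object ID carried as State so that changing it alone does not retrigger the heavy plotting callbacks. The same pattern in isolation, with made-up component ids:

    from dash import Dash, Input, Output, State, dcc, html

    app = Dash(__name__)
    app.layout = html.Div([
        dcc.Dropdown(['a', 'b'], id='primary'),
        dcc.Dropdown(id='secondary'),
        html.Div(id='result'),
    ])


    @app.callback(Output('secondary', 'options'), Input('primary', 'value'))
    def fill_secondary(primary_value):
        # the second dropdown's options depend on the first selection
        return [f'{primary_value}-1', f'{primary_value}-2']


    @app.callback(
        Output('result', 'children'),
        Input('secondary', 'value'),
        State('primary', 'value'),  # read without retriggering on change
    )
    def show(secondary_value, primary_value):
        return f'{primary_value} / {secondary_value}'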
@@ -1,9 +1,9 @@
+import copy
 import time
 import webbrowser
 from pathlib import Path
 from threading import Thread
 from typing import cast
-import copy

 import dash_cytoscape as cyto
 from dash import Dash, Input, Output, State, dcc, html
@@ -30,20 +30,20 @@ app = Dash(__name__, external_stylesheets=external_stylesheets)

 cose_layout = {
     'name': 'cose',
-    'nodeOverlap': 20,
+    'nodeOverlap': 500,
     'refresh': 20,
     'fit': True,
-    'padding': 30,
+    'padding': 20,
-    'randomize': True,
+    'randomize': False,
-    'componentSpacing': 40,
+    'componentSpacing': 1.2,
-    'nodeRepulsion': 2000,
+    'nodeRepulsion': 1000,
     'edgeElasticity': 1000,
     'idealEdgeLength': 100,
     'nestingFactor': 1.2,
     'gravity': 50,
-    'numIter': 2000,
+    'numIter': 3000,
-    'initialTemp': 1000,
+    'initialTemp': 2000,
-    'coolingFactor': 0.95,
+    'coolingFactor': 0.7,
     'minTemp': 1.0,
     'nodeDimensionsIncludeLabels': True,
 }
@@ -108,9 +108,8 @@ my_stylesheet = [
     # {'selector': '.triangle', 'style': {'shape': 'triangle'}},
 ]

-app.layout = html.Div(
+layout = html.Div(
     [
-        html.Button('Trigger JS Layout', id='test_js'),
         html.Button('Trigger JS Weight', id='test_js_weight'),
         html.Div(id='output'),
         html.Div(
@@ -166,11 +165,13 @@ app.layout = html.Div(
             style={'width': '40%'},
         ),
         html.H3('Graph'),
+        html.Button('Re-Layout', id='trigger_relayout'),
         html.Div(
             [
                 cyto.Cytoscape(
                     id='cytoscape-graph',
                     style={'width': '100%', 'height': '600px'},
+                    layout=cose_layout,
                     stylesheet=my_stylesheet,
                     elements=cyto_data_base,
                     zoom=1,
@@ -192,6 +193,9 @@ app.layout = html.Div(
 )


+app.layout = layout
+
+
 @app.callback(
     Output('cytoscape-graph', 'layout', allow_duplicate=True),
     Input('layout_choice', 'value'),
@@ -266,17 +270,17 @@ app.clientside_callback(
     """
     function(n_clicks, layout) {
         layout.edgeElasticity = function(edge) {
-            return edge.data().weight * 4;
+            return edge.data().weight * 0.05;
         };
         layout.idealEdgeLength = function(edge) {
-            return edge.data().weight * 0.8;
+            return edge.data().weight * 0.4;
         };
         cy.layout(layout).run();
         return layout;
     }
     """,
     Output('cytoscape-graph', 'layout', allow_duplicate=True),
-    Input('test_js', 'n_clicks'),
+    Input('trigger_relayout', 'n_clicks'),
     State('cytoscape-graph', 'layout'),
     prevent_initial_call=True,
 )
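The re-layout trigger runs clientside: the callback body is JavaScript executed in the browser, rewriting edgeElasticity and idealEdgeLength as functions of each edge's weight before rerunning the cose layout on cy, the Cytoscape instance the diff's own callbacks reference. Reduced to its core, with the ids used in this file:

    from dash import Dash, Input, Output, State

    app = Dash(__name__)

    app.clientside_callback(
        """
        function(n_clicks, layout) {
            // scale the ideal edge length with the data-driven weight
            layout.idealEdgeLength = function(edge) {
                return edge.data().weight * 0.4;
            };
            cy.layout(layout).run();  // 'cy' as used by the callbacks above
            return layout;
        }
        """,
        Output('cytoscape-graph', 'layout', allow_duplicate=True),
        Input('trigger_relayout', 'n_clicks'),
        State('cytoscape-graph', 'layout'),
        prevent_initial_call=True,
    )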
@@ -1,368 +0,0 @@
-import json
-import os
-
-import dash
-import dash_cytoscape as cyto
-from dash import Input, Output, State, callback, dcc, html
-
-# Load extra layouts
-cyto.load_extra_layouts()
-
-
-# Display utility functions
-def _merge(a, b):
-    return dict(a, **b)
-
-
-def _omit(omitted_keys, d):
-    return {k: v for k, v in d.items() if k not in omitted_keys}
-
-
-# Custom Display Components
-def Card(children, **kwargs):
-    return html.Section(
-        children,
-        style=_merge(
-            {
-                'padding': 20,
-                'margin': 5,
-                'borderRadius': 5,
-                'border': 'thin lightgrey solid',
-                'background-color': 'white',
-                # Remove possibility to select the text for better UX
-                'user-select': 'none',
-                '-moz-user-select': 'none',
-                '-webkit-user-select': 'none',
-                '-ms-user-select': 'none',
-            },
-            kwargs.get('style', {}),
-        ),
-        **_omit(['style'], kwargs),
-    )
-
-
-def SectionTitle(title, size, align='center', color='#222'):
-    return html.Div(
-        style={'text-align': align, 'color': color},
-        children=dcc.Markdown('#' * size + ' ' + title),
-    )
-
-
-def NamedCard(title, size, children, **kwargs):
-    size = min(size, 6)
-    size = max(size, 1)
-
-    return html.Div([Card([SectionTitle(title, size, align='left')] + children, **kwargs)])
-
-
-def NamedSlider(name, **kwargs):
-    return html.Div(
-        style={'padding': '20px 10px 25px 4px'},
-        children=[
-            html.P(f'{name}:'),
-            html.Div(style={'margin-left': '6px'}, children=dcc.Slider(**kwargs)),
-        ],
-    )
-
-
-def NamedDropdown(name, **kwargs):
-    return html.Div(
-        style={'margin': '10px 0px'},
-        children=[
-            html.P(children=f'{name}:', style={'margin-left': '3px'}),
-            dcc.Dropdown(**kwargs),
-        ],
-    )
-
-
-def NamedRadioItems(name, **kwargs):
-    return html.Div(
-        style={'padding': '20px 10px 25px 4px'},
-        children=[html.P(children=f'{name}:'), dcc.RadioItems(**kwargs)],
-    )
-
-
-def NamedInput(name, **kwargs):
-    return html.Div(children=[html.P(children=f'{name}:'), dcc.Input(**kwargs)])
-
-
-# Utils
-def DropdownOptionsList(*args):
-    return [{'label': val.capitalize(), 'value': val} for val in args]
-
-
-asset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'assets')
-
-app = dash.Dash(__name__, assets_folder=asset_path)
-server = app.server
-
-
-# ###################### DATA PREPROCESSING ######################
-# Load data
-with open('sample_network.txt', 'r', encoding='utf-8') as f:
-    network_data = f.read().split('\n')
-
-# We select the first 750 edges and associated nodes for an easier visualization
-edges = network_data[:750]
-nodes = set()
-
-following_node_di = {}  # user id -> list of users they are following
-following_edges_di = {}  # user id -> list of cy edges starting from user id
-
-followers_node_di = {}  # user id -> list of followers (cy_node format)
-followers_edges_di = {}  # user id -> list of cy edges ending at user id
-
-cy_edges = []
-cy_nodes = []
-
-for edge in edges:
-    if ' ' not in edge:
-        continue
-
-    source, target = edge.split(' ')
-
-    cy_edge = {'data': {'id': source + target, 'source': source, 'target': target}}
-    cy_target = {'data': {'id': target, 'label': 'User #' + str(target[-5:])}}
-    cy_source = {'data': {'id': source, 'label': 'User #' + str(source[-5:])}}
-
-    if source not in nodes:
-        nodes.add(source)
-        cy_nodes.append(cy_source)
-    if target not in nodes:
-        nodes.add(target)
-        cy_nodes.append(cy_target)
-
-    # Process dictionary of following
-    if not following_node_di.get(source):
-        following_node_di[source] = []
-    if not following_edges_di.get(source):
-        following_edges_di[source] = []
-
-    following_node_di[source].append(cy_target)
-    following_edges_di[source].append(cy_edge)
-
-    # Process dictionary of followers
-    if not followers_node_di.get(target):
-        followers_node_di[target] = []
-    if not followers_edges_di.get(target):
-        followers_edges_di[target] = []
-
-    followers_node_di[target].append(cy_source)
-    followers_edges_di[target].append(cy_edge)
-
-genesis_node = cy_nodes[0]
-genesis_node['classes'] = 'genesis'
-default_elements = [genesis_node]
-
-default_stylesheet = [
-    {'selector': 'node', 'style': {'opacity': 0.65, 'z-index': 9999}},
-    {
-        'selector': 'edge',
-        'style': {'curve-style': 'bezier', 'opacity': 0.45, 'z-index': 5000},
-    },
-    {'selector': '.followerNode', 'style': {'background-color': '#0074D9'}},
-    {
-        'selector': '.followerEdge',
-        'style': {
-            'mid-target-arrow-color': 'blue',
-            'mid-target-arrow-shape': 'vee',
-            'line-color': '#0074D9',
-        },
-    },
-    {'selector': '.followingNode', 'style': {'background-color': '#FF4136'}},
-    {
-        'selector': '.followingEdge',
-        'style': {
-            'mid-target-arrow-color': 'red',
-            'mid-target-arrow-shape': 'vee',
-            'line-color': '#FF4136',
-        },
-    },
-    {
-        'selector': '.genesis',
-        'style': {
-            'background-color': '#B10DC9',
-            'border-width': 2,
-            'border-color': 'purple',
-            'border-opacity': 1,
-            'opacity': 1,
-            'label': 'data(label)',
-            'color': '#B10DC9',
-            'text-opacity': 1,
-            'font-size': 12,
-            'z-index': 9999,
-        },
-    },
-    {
-        'selector': ':selected',
-        'style': {
-            'border-width': 2,
-            'border-color': 'black',
-            'border-opacity': 1,
-            'opacity': 1,
-            'label': 'data(label)',
-            'color': 'black',
-            'font-size': 12,
-            'z-index': 9999,
-        },
-    },
-]
-
-# ################################# APP LAYOUT ################################
-styles = {
-    'json-output': {
-        'overflow-y': 'scroll',
-        'height': 'calc(50% - 25px)',
-        'border': 'thin lightgrey solid',
-    },
-    'tab': {'height': 'calc(98vh - 80px)'},
-}
-
-app.layout = html.Div(
-    [
-        html.Div(
-            className='eight columns',
-            children=[
-                cyto.Cytoscape(
-                    id='cytoscape',
-                    elements=default_elements,
-                    stylesheet=default_stylesheet,
-                    style={'height': '95vh', 'width': '100%'},
-                )
-            ],
-        ),
-        html.Div(
-            className='four columns',
-            children=[
-                dcc.Tabs(
-                    id='tabs',
-                    children=[
-                        dcc.Tab(
-                            label='Control Panel',
-                            children=[
-                                NamedDropdown(
-                                    name='Layout',
-                                    id='dropdown-layout',
-                                    options=DropdownOptionsList(
-                                        'random',
-                                        'grid',
-                                        'circle',
-                                        'concentric',
-                                        'breadthfirst',
-                                        'cose',
-                                        'cose-bilkent',
-                                        'dagre',
-                                        'cola',
-                                        'klay',
-                                        'spread',
-                                        'euler',
-                                    ),
-                                    value='grid',
-                                    clearable=False,
-                                ),
-                                NamedRadioItems(
-                                    name='Expand',
-                                    id='radio-expand',
-                                    options=DropdownOptionsList('followers', 'following'),
-                                    value='followers',
-                                ),
-                            ],
-                        ),
-                        dcc.Tab(
-                            label='JSON',
-                            children=[
-                                html.Div(
-                                    style=styles['tab'],
-                                    children=[
-                                        html.P('Node Object JSON:'),
-                                        html.Pre(
-                                            id='tap-node-json-output',
-                                            style=styles['json-output'],
-                                        ),
-                                        html.P('Edge Object JSON:'),
-                                        html.Pre(
-                                            id='tap-edge-json-output',
-                                            style=styles['json-output'],
-                                        ),
-                                    ],
-                                )
-                            ],
-                        ),
-                    ],
-                ),
-            ],
-        ),
-    ]
-)
-
-
-# ############################## CALLBACKS ####################################
-@callback(Output('tap-node-json-output', 'children'), Input('cytoscape', 'tapNode'))
-def display_tap_node(data):
-    return json.dumps(data, indent=2)
-
-
-@callback(Output('tap-edge-json-output', 'children'), Input('cytoscape', 'tapEdge'))
-def display_tap_edge(data):
-    return json.dumps(data, indent=2)
-
-
-@callback(Output('cytoscape', 'layout'), Input('dropdown-layout', 'value'))
-def update_cytoscape_layout(layout):
-    return {'name': layout}
-
-
-@callback(
-    Output('cytoscape', 'elements'),
-    Input('cytoscape', 'tapNodeData'),
-    State('cytoscape', 'elements'),
-    State('radio-expand', 'value'),
-)
-def generate_elements(nodeData, elements, expansion_mode):
-    if not nodeData:
-        return default_elements
-
-    # If the node has already been expanded, we don't expand it again
-    if nodeData.get('expanded'):
-        return elements
-
-    # This retrieves the currently selected element, and tag it as expanded
-    for element in elements:
-        if nodeData['id'] == element.get('data').get('id'):
-            element['data']['expanded'] = True
-            break
-
-    if expansion_mode == 'followers':
-        followers_nodes = followers_node_di.get(nodeData['id'])
-        followers_edges = followers_edges_di.get(nodeData['id'])
-
-        if followers_nodes:
-            for node in followers_nodes:
-                node['classes'] = 'followerNode'
-            elements.extend(followers_nodes)
-
-        if followers_edges:
-            for follower_edge in followers_edges:
-                follower_edge['classes'] = 'followerEdge'
-            elements.extend(followers_edges)
-
-    elif expansion_mode == 'following':
-        following_nodes = following_node_di.get(nodeData['id'])
-        following_edges = following_edges_di.get(nodeData['id'])
-
-        if following_nodes:
-            for node in following_nodes:
-                if node['data']['id'] != genesis_node['data']['id']:
-                    node['classes'] = 'followingNode'
-                    elements.append(node)
-
-        if following_edges:
-            for follower_edge in following_edges:
-                follower_edge['classes'] = 'followingEdge'
-            elements.extend(following_edges)
-
-    return elements
-
-
-if __name__ == '__main__':
-    app.run_server(debug=True)
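The deleted demo precomputes follower/following adjacency dictionaries so that tapping a node splices its neighbourhood into the element list without recomputing the full graph. The core of that pattern as a runnable sketch, with made-up data:

    # Sketch: expand a tapped node by appending its precomputed neighbours.
    neighbours = {'a': [{'data': {'id': 'b', 'label': 'B'}}]}                 # made-up adjacency
    adj_edges = {'a': [{'data': {'id': 'ab', 'source': 'a', 'target': 'b'}}]}


    def expand(node_id: str, elements: list[dict]) -> list[dict]:
        for node in neighbours.get(node_id, []):
            node['classes'] = 'followerNode'  # style hook consumed by the stylesheet
        elements.extend(neighbours.get(node_id, []))
        elements.extend(adj_edges.get(node_id, []))
        return elements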
scripts/dashboard/timeline.py (new file, 507 lines)

@@ -0,0 +1,507 @@
+import time
+import webbrowser
+from pathlib import Path
+from threading import Thread
+from typing import cast
+
+import dash_cytoscape as cyto
+import pandas as pd
+import plotly.express as px
+from dash import (
+    Dash,
+    Input,
+    Output,
+    State,
+    callback,
+    dash_table,
+    dcc,
+    html,
+)
+from pandas import DataFrame
+
+from lang_main.analysis import graphs
+from lang_main.io import load_pickle
+from lang_main.types import ObjectID, TimelineCandidates
+from lang_main.analysis import tokens
+from lang_main.constants import SPCY_MODEL
+
+# df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/gapminder_unfiltered.csv')
+
+# ** data
+# p_df = Path(r'../Pipe-TargetFeature_Step-3_remove_NA.pkl').resolve()
+p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
+# p_tl = Path(r'/Pipe-Timeline_Analysis_Step-4_get_timeline_candidates.pkl').resolve()
+p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
+ret = cast(tuple[DataFrame], load_pickle(p_df))
+data = ret[0]
+ret = cast(tuple[TimelineCandidates, dict[ObjectID, str]], load_pickle(p_tl))
+cands = ret[0]
+texts = ret[1]
+
+# p_df = Path(r'.\test-notebooks\dashboard\data.pkl')
+# p_cands = Path(r'.\test-notebooks\dashboard\map_candidates.pkl')
+# p_map = Path(r'.\test-notebooks\dashboard\map_texts.pkl')
+# data = cast(DataFrame, load_pickle(p_df))
+# cands = cast(TimelineCandidates, load_pickle(p_cands))
+# texts = cast(dict[ObjectID, str], load_pickle(p_map))
+
+table_feats = [
+    'ErstellungsDatum',
+    'ErledigungsDatum',
+    'VorgangsTypName',
+    'VorgangsBeschreibung',
+]
+table_feats_dates = [
+    'ErstellungsDatum',
+    'ErledigungsDatum',
+]
+
+# ** figure config
+markers = {
+    'size': 12,
+    'color': 'yellow',
+    'line': {
+        'width': 2,
+        'color': 'red',
+    },
+}
+hover_data = {
+    'ErstellungsDatum': '|%d.%m.%Y',
+    'VorgangsBeschreibung': True,
+}
+
+# ** graphs
+target = '../results/test_20240529/Pipe-Token_Analysis_Step-1_build_token_graph.pkl'
+p = Path(target).resolve()
+ret = load_pickle(p)
+tk_graph = cast(graphs.TokenGraph, ret[0])
+tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, 150, None)
+tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
+# tk_graph_filtered = tk_graph.filter_by_edge_weight(150, None)
+# tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
+cyto_data_base, weight_data = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
+
+MIN_WEIGHT = weight_data['min']
+MAX_WEIGHT = weight_data['max']
+
+cyto.load_extra_layouts()
+
+cose_layout = {
+    'name': 'cose',
+    'nodeOverlap': 500,
+    'refresh': 20,
+    'fit': True,
+    'padding': 20,
+    'randomize': False,
+    'componentSpacing': 1.2,
+    'nodeRepulsion': 1000,
+    'edgeElasticity': 1000,
+    'idealEdgeLength': 100,
+    'nestingFactor': 1.2,
+    'gravity': 50,
+    'numIter': 3000,
+    'initialTemp': 2000,
+    'coolingFactor': 0.7,
+    'minTemp': 1.0,
+    'nodeDimensionsIncludeLabels': True,
+}
+
+my_stylesheet = [
+    # Group selectors
+    {
+        'selector': 'node',
+        'style': {
+            'shape': 'circle',
+            'content': 'data(label)',
+            'background-color': '#B10DC9',
+            'border-width': 2,
+            'border-color': 'black',
+            'border-opacity': 1,
+            'opacity': 1,
+            'color': 'black',
+            'text-opacity': 1,
+            'font-size': 12,
+            'z-index': 9999,
+        },
+    },
+    {
+        'selector': 'edge',
+        'style': {
+            #'width': f'mapData(weight, {MIN_WEIGHT}, {MAX_WEIGHT}, 1, 10)',
+            # 'width': """function(ele) {
+            #     return ele.data('weight');
+            # """,
+            'curve-style': 'bezier',
+            'line-color': 'grey',
+            'line-style': 'solid',
+            'line-opacity': 1,
+        },
+    },
+    # Class selectors
+    # {'selector': '.red', 'style': {'background-color': 'red', 'line-color': 'red'}},
+    # {'selector': '.triangle', 'style': {'shape': 'triangle'}},
+]
+
+# ** app
+external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
+app = Dash(__name__, external_stylesheets=external_stylesheets)
+
+graph_layout = html.Div(
+    [
+        html.Button('Trigger JS Weight', id='test_js_weight'),
+        html.Button('Trigger Candidate Graph', id='cand_graph'),
+        html.Div(id='output'),
+        html.Div(
+            [
+                html.H2('Token Graph', style={'margin': 0}),
+                html.Button(
+                    'Reset Default',
+                    id='bt-reset',
+                    style={
+                        'marginLeft': 'auto',
+                        'width': '300px',
+                    },
+                ),
+            ],
+            style={
+                'display': 'flex',
+                'marginBottom': '1em',
+            },
+        ),
+        html.H3('Layout'),
+        dcc.Dropdown(
+            id='layout_choice',
+            options=[
+                'cose',
+                'cola',
+                'euler',
+                'random',
+            ],
+            value='cose',
+            clearable=False,
+        ),
+        html.Div(
+            [
+                html.H3('Graph Filter'),
+                dcc.Input(
+                    id='weight_min',
+                    type='number',
+                    min=MIN_WEIGHT,
+                    max=MAX_WEIGHT,
+                    step=1,
+                    placeholder=f'Minimum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
+                    debounce=True,
+                    style={'width': '40%'},
+                ),
+                dcc.Input(
+                    id='weight_max',
+                    type='number',
+                    min=MIN_WEIGHT,
+                    max=MAX_WEIGHT,
+                    step=1,
+                    placeholder=f'Maximum edge weight: {MIN_WEIGHT} - {MAX_WEIGHT}',
+                    debounce=True,
+                    style={'width': '40%'},
+                ),
+                html.H3('Graph'),
+                html.Button('Re-Layout', id='trigger_relayout'),
+                html.Div(
+                    [
+                        cyto.Cytoscape(
+                            id='cytoscape-graph',
+                            style={'width': '100%', 'height': '600px'},
+                            layout=cose_layout,
+                            stylesheet=my_stylesheet,
+                            elements=cyto_data_base,
+                            zoom=1,
+                        ),
+                    ],
+                    style={
+                        'border': '3px solid black',
+                        'borderRadius': '25px',
+                        'marginTop': '1em',
+                        'marginBottom': '2em',
+                        'padding': '7px',
+                    },
+                ),
+            ],
+            style={'marginTop': '1em'},
+        ),
+    ],
+)
+
+app.layout = html.Div(
+    [
+        html.H1(children='Demo Zeitreihenanalyse', style={'textAlign': 'center'}),
+        html.Div(
+            children=[
+                html.H2('Wählen Sie ein Objekt aus (ObjektID):'),
+                dcc.Dropdown(
+                    list(cands.keys()),
+                    id='dropdown-selection',
+                    placeholder='ObjektID auswählen...',
+                ),
+            ]
+        ),
+        html.Div(
+            children=[
+                html.H3(id='object_text'),
+                dcc.Dropdown(id='choice-candidates'),
+                dcc.Graph(id='graph-output'),
+            ]
+        ),
+        html.Div(
+            [dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
+        ),
+        graph_layout,
+    ],
+    style={'margin': '2em'},
+)
+
+
+@callback(
+    Output('object_text', 'children'),
+    Input('dropdown-selection', 'value'),
+    prevent_initial_call=True,
+)
+def update_obj_text(obj_id):
+    obj_id = int(obj_id)
+    obj_text = texts[obj_id]
+    headline = f'HObjektText: {obj_text}'
+    return headline
+
+
+@callback(
+    Output('choice-candidates', 'options'),
+    Input('dropdown-selection', 'value'),
+    prevent_initial_call=True,
+)
+def update_choice_candidates(obj_id):
+    obj_id = int(obj_id)
+    cands_obj_id = cands[obj_id]
+    choices = list(range(1, len(cands_obj_id) + 1))
+    return choices
+
+
+@callback(
+    Output('graph-output', 'figure'),
+    Input('choice-candidates', 'value'),
+    State('dropdown-selection', 'value'),
+    prevent_initial_call=True,
+)
+def update_timeline(index, obj_id):
+    obj_id = int(obj_id)
+    # title
+    obj_text = texts[obj_id]
+    title = f'HObjektText: {obj_text}'
+    # cands
+    cands_obj_id = cands[obj_id]
+    cands_choice = cands_obj_id[int(index) - 1]
+    # data
+    df = data.loc[list(cands_choice)].sort_index()  # type: ignore
+    # figure
+    fig = px.line(
+        data_frame=df,
+        x='ErstellungsDatum',
+        y='ObjektID',
+        title=title,
+        hover_data=hover_data,
+    )
+    fig.update_traces(mode='markers+lines', marker=markers, marker_symbol='diamond')
+    fig.update_xaxes(
+        tickformat='%B\n%Y',
+        rangeslider_visible=True,
+    )
+    fig.update_yaxes(type='category')
+    fig.update_layout(hovermode='x unified')
+    return fig
+
+
+@callback(
+    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
+    Input('choice-candidates', 'value'),
+    State('dropdown-selection', 'value'),
+    prevent_initial_call=True,
+)
+def update_table_candidates(index, obj_id):
+    # obj_id = int(obj_id)
+    # # cands
+    # cands_obj_id = cands[obj_id]
+    # cands_choice = cands_obj_id[int(index) - 1]
+    # # data
+    # df = data.loc[list(cands_choice)].sort_index()  # type: ignore
+    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = df.filter(items=table_feats, axis=1).sort_values(
+        by='ErstellungsDatum', ascending=True
+    )
+    cols = [{'name': i, 'id': i} for i in df.columns]
+    # convert dates to strings
+    for col in table_feats_dates:
+        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
+
+    table_data = df.to_dict('records')
+    return table_data, cols
+
+
+def pre_filter_data(
+    data: DataFrame,
+    idx: int,
+    obj_id: ObjectID,
+) -> DataFrame:
+    obj_id = int(obj_id)
+    data = data.copy()
+    # cands
+    cands_obj_id = cands[obj_id]
+    cands_choice = cands_obj_id[int(idx) - 1]
+    # data
+    data = data.loc[list(cands_choice)].sort_index()  # type: ignore
+
+    return data
+
+
+# ** graph
+@app.callback(
+    Output('cytoscape-graph', 'elements', allow_duplicate=True),
+    Output('weight_min', 'min', allow_duplicate=True),
+    Output('weight_min', 'max', allow_duplicate=True),
+    Output('weight_min', 'placeholder', allow_duplicate=True),
+    Output('weight_max', 'min', allow_duplicate=True),
+    Output('weight_max', 'max', allow_duplicate=True),
+    Output('weight_max', 'placeholder', allow_duplicate=True),
+    Input('cand_graph', 'n_clicks'),
+    State('choice-candidates', 'value'),
+    State('dropdown-selection', 'value'),
+    prevent_initial_call=True,
+)
+def update_graph_candidates(_, index, obj_id):
+    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    tk_graph_cands, _ = tokens.build_token_graph(
+        data=df,
+        model=SPCY_MODEL,
+        target_feature='VorgangsBeschreibung',
+        build_map=False,
+    )
+    cyto_data, weight_info = graphs.convert_graph_to_cytoscape(tk_graph_cands)
+    weight_min = weight_info['min']
+    weight_max = weight_info['max']
+    placeholder_min = f'Minimum edge weight: {weight_min} - {weight_max}'
+    placeholder_max = f'Maximum edge weight: {weight_min} - {weight_max}'
+    return (
+        cyto_data,
+        weight_min,
+        weight_max,
+        placeholder_min,
+        weight_min,
+        weight_max,
+        placeholder_max,
+    )
+
+
+@app.callback(
+    Output('cytoscape-graph', 'layout', allow_duplicate=True),
+    Input('layout_choice', 'value'),
+    prevent_initial_call=True,
+)
+def update_layout_internal(layout_choice):
+    # return {'name': layout_choice}
+    return cose_layout
+    # return cose_bilkent_layout
+    # return cola_layout
+
+
+@app.callback(
+    Output('cytoscape-graph', 'zoom'),
+    Output('cytoscape-graph', 'elements', allow_duplicate=True),
+    Output('weight_min', 'value'),
+    Output('weight_max', 'value'),
+    Input('bt-reset', 'n_clicks'),
+    prevent_initial_call=True,
+)
+def reset_layout(n_clicks):
+    return (1, cyto_data_base, None, None)
+
+
+# update edge weight
+@app.callback(
+    Output('cytoscape-graph', 'elements', allow_duplicate=True),
+    Input('weight_min', 'value'),
+    Input('weight_max', 'value'),
+    prevent_initial_call=True,
+)
+def update_edge_weight(weight_min, weight_max):
+    if not any([weight_min, weight_max]):
+        return cyto_data_base
+
+    if weight_min is None:
+        weight_min = MIN_WEIGHT
+    if weight_max is None:
+        weight_max = MAX_WEIGHT
+    tk_graph_filtered = graphs.filter_graph_by_edge_weight(tk_graph, weight_min, weight_max)
+    # tk_graph_filtered = tk_graph.filter_by_edge_weight(weight_min, weight_max)
+    tk_graph_filtered = graphs.filter_graph_by_node_degree(tk_graph_filtered, 1, None)
+    # tk_graph_filtered = tk_graph_filtered.filter_by_node_degree(1, None)
+    cyto_data, _ = graphs.convert_graph_to_cytoscape(tk_graph_filtered)
+    return cyto_data
+
+
+app.clientside_callback(
+    """
+    function(n_clicks, layout) {
+        layout.edgeElasticity = function(edge) {
+            return edge.data().weight * 0.05;
+        };
+        layout.idealEdgeLength = function(edge) {
+            return edge.data().weight * 0.4;
+        };
+        cy.layout(layout).run();
+        return layout;
+    }
+    """,
+    Output('cytoscape-graph', 'layout', allow_duplicate=True),
+    Input('trigger_relayout', 'n_clicks'),
+    State('cytoscape-graph', 'layout'),
+    prevent_initial_call=True,
+)
+
+app.clientside_callback(
+    """
+    function(n_clicks, stylesheet) {
+        function edge_weight(ele) {
+            let threshold = 1000;
+            let weight = ele.data('weight');
+            if (weight > threshold) {
+                weight = 12;
+            } else {
+                weight = weight / threshold * 10;
+                weight = Math.max(1, weight);
+            }
+            return weight;
+        }
+        stylesheet[1].style.width = edge_weight;
+        cy.style(stylesheet).update();
+        return stylesheet;
+    }
+    """,
+    Output('cytoscape-graph', 'stylesheet'),
+    Input('test_js_weight', 'n_clicks'),
+    State('cytoscape-graph', 'stylesheet'),
+    prevent_initial_call=False,
+)
+
+
+def _start_webbrowser():
+    host = '127.0.0.1'
+    port = '8050'
+    adress = f'http://{host}:{port}/'
+    time.sleep(2)
+    webbrowser.open_new(adress)
+
+
+def main():
+    webbrowser_thread = Thread(target=_start_webbrowser, daemon=True)
+    webbrowser_thread.start()
+    app.run(debug=True)
+
+
+if __name__ == '__main__':
+    main()
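Unlike the deleted dashboard, timeline.py can rebuild a token graph on demand for the selected candidate rows (update_graph_candidates) instead of only filtering the precomputed global graph. The flow condensed from that callback, with illustrative inputs (it still needs the lang_main package to run):

    # Condensed from update_graph_candidates above; 1 and 4711 are made-up inputs.
    df = pre_filter_data(data, idx=1, obj_id=4711)        # rows of one candidate set
    tk_graph_cands, _ = tokens.build_token_graph(
        data=df,
        model=SPCY_MODEL,
        target_feature='VorgangsBeschreibung',
        build_map=False,                                  # docs mapping not needed here
    )
    cyto_data, weight_info = graphs.convert_graph_to_cytoscape(tk_graph_cands)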
@@ -1,38 +0,0 @@
-# lang_main: Config file
-
-[paths]
-inputs = 'A:/Arbeitsaufgaben/lang-main/scripts'
-results = 'A:/Arbeitsaufgaben/lang-main/scripts/results/test_20240529/'
-dataset = 'A:/Arbeitsaufgaben/lang-main/data/02_202307/Export4.csv'
-#results = './results/Export7/'
-#dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
-#results = './results/Export7_trunc/'
-#dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
-
-[control]
-preprocessing = true
-preprocessing_skip = false
-token_analysis = false
-token_analysis_skip = true
-graph_postprocessing = false
-graph_postprocessing_skip = true
-
-#[export_filenames]
-#filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
-
-[preprocess]
-filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
-date_cols = [
-    "VorgangsDatum",
-    "ErledigungsDatum",
-    "Arbeitsbeginn",
-    "ErstellungsDatum",
-]
-threshold_amount_characters = 5
-threshold_similarity = 0.8
-
-[graph_postprocessing]
-threshold_edge_weight = 150
-
-[time_analysis]
-threshold_unique_texts = 5
@@ -2,22 +2,20 @@

 [paths]
 inputs = './inputs/'
-results = './results/test_20240529/'
+results = './results/test_20240619/'
 dataset = '../data/02_202307/Export4.csv'
 #results = './results/Export7/'
 #dataset = './01_03_Rohdaten_202403/Export7_59499_Zeilen.csv'
 #results = './results/Export7_trunc/'
 #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'

+# only debugging features, production-ready pipelines should always
+# be fully executed
 [control]
-preprocessing = false
-preprocessing_skip = false
-token_analysis = true
-token_analysis_skip = false
-graph_postprocessing = false
+preprocessing_skip = true
+token_analysis_skip = true
 graph_postprocessing_skip = true
-time_analysis = false
-time_analysis_skip = true
+time_analysis_skip = false

 #[export_filenames]
 #filename_cossim_filter_candidates = 'CosSim-FilterCandidates'
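The [control] section drops the paired do/skip booleans in favour of skip-only flags, which is exactly what the new PipelineContainer.add(..., skip=...) calls consume. A sketch of loading them (TOML layout as in this diff; the loader function name is an assumption):

    import tomllib  # stdlib since Python 3.11, matching requires-python above


    def load_skip_flags(path: str) -> dict[str, bool]:
        with open(path, 'rb') as f:
            config = tomllib.load(f)
        # skip-only semantics: a stage runs unless its *_skip flag is true
        return {k: v for k, v in config['control'].items() if k.endswith('_skip')}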
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import copy
 import sys
 import typing
@@ -169,6 +171,90 @@ def convert_graph_to_cytoscape(
     return cyto_data, weight_metadata


+def filter_graph_by_edge_weight(
+    graph: TokenGraph,
+    bound_lower: int | None,
+    bound_upper: int | None,
+) -> TokenGraph:
+    """filters all edges which are within the provided bounds
+
+    Parameters
+    ----------
+    bound_lower : int | None
+        lower bound for edge weights, edges with weight equal to this value are retained
+    bound_upper : int | None
+        upper bound for edge weights, edges with weight equal to this value are retained
+
+    Returns
+    -------
+    TokenGraph
+        a copy of the graph with filtered edges
+    """
+    original_graph_edges = copy.deepcopy(graph.edges)
+    filtered_graph = graph.copy()
+
+    if not any([bound_lower, bound_upper]):
+        logger.warning('No bounds provided, returning original graph.')
+        return filtered_graph
+
+    for edge in original_graph_edges:
+        weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
+        if bound_lower is not None and weight < bound_lower:
+            filtered_graph.remove_edge(edge[0], edge[1])
+        if bound_upper is not None and weight > bound_upper:
+            filtered_graph.remove_edge(edge[0], edge[1])
+
+    if filtered_graph._undirected is not None:
+        filtered_graph.to_undirected(inplace=True, logging=False)
+
+    filtered_graph.update_metadata(logging=False)
+
+    return filtered_graph
+
+
+def filter_graph_by_node_degree(
+    graph: TokenGraph,
+    bound_lower: int | None,
+    bound_upper: int | None,
+) -> TokenGraph:
+    """filters all nodes which are within the provided bounds by their degree
+
+    Parameters
+    ----------
+    bound_lower : int | None
+        lower bound for node degree, nodes with degree equal to this value are retained
+    bound_upper : int | None
+        upper bound for node degree, nodes with degree equal to this value are retained
+
+    Returns
+    -------
+    TokenGraph
+        a copy of the graph with filtered nodes
+    """
+    # filter nodes by degree
+    original_graph_nodes = copy.deepcopy(graph.nodes)
+    filtered_graph = graph.copy()
+
+    if not any([bound_lower, bound_upper]):
+        logger.warning('No bounds provided, returning original graph.')
+        return filtered_graph
+
+    for node in original_graph_nodes:
+        degree = filtered_graph.degree[node]  # type: ignore
+        if bound_lower is not None and degree < bound_lower:
+            filtered_graph.remove_node(node)
+        if bound_upper is not None and degree > bound_upper:
+            filtered_graph.remove_node(node)
+
+    if filtered_graph._undirected is not None:
+        filtered_graph.to_undirected(inplace=True, logging=False)
+
+    filtered_graph.update_metadata(logging=False)
+
+    return filtered_graph
+
+
+# ** ---------------------------------------
 class TokenGraph(DiGraph):
     def __init__(
         self,
@@ -286,87 +372,6 @@ class TokenGraph(DiGraph):
             graph=self._undirected, logging=logging
         )
 
-    def filter_by_edge_weight(
-        self,
-        bound_lower: int | None,
-        bound_upper: int | None,
-    ) -> Self:
-        """filters all edges which are within the provided bounds
-
-        Parameters
-        ----------
-        bound_lower : int | None
-            lower bound for edge weights, edges with weight equal to this value are retained
-        bound_upper : int | None
-            upper bound for edge weights, edges with weight equal to this value are retained
-
-        Returns
-        -------
-        Self
-            a copy of the graph with filtered edges
-        """
-        original_graph_edges = copy.deepcopy(self.edges)
-        filtered_graph = self.copy()
-
-        if not any([bound_lower, bound_upper]):
-            logger.warning('No bounds provided, returning original graph.')
-            return filtered_graph
-
-        for edge in original_graph_edges:
-            weight = typing.cast(int, filtered_graph[edge[0]][edge[1]]['weight'])
-            if bound_lower is not None and weight < bound_lower:
-                filtered_graph.remove_edge(edge[0], edge[1])
-            if bound_upper is not None and weight > bound_upper:
-                filtered_graph.remove_edge(edge[0], edge[1])
-
-        if filtered_graph._undirected is not None:
-            filtered_graph.to_undirected(inplace=True, logging=False)
-
-        filtered_graph.update_metadata(logging=False)
-
-        return filtered_graph
-
-    def filter_by_node_degree(
-        self,
-        bound_lower: int | None,
-        bound_upper: int | None,
-    ) -> Self:
-        """filters all nodes which are within the provided bounds by their degree
-
-        Parameters
-        ----------
-        bound_lower : int | None
-            lower bound for node degree, nodes with degree equal to this value are retained
-        bound_upper : int | None
-            upper bound for node degree, nodes with degree equal to this value are retained
-
-        Returns
-        -------
-        Self
-            a copy of the graph with filtered nodes
-        """
-        # filter nodes by degree
-        original_graph_nodes = copy.deepcopy(self.nodes)
-        filtered_graph = self.copy()
-
-        if not any([bound_lower, bound_upper]):
-            logger.warning('No bounds provided, returning original graph.')
-            return filtered_graph
-
-        for node in original_graph_nodes:
-            degree = filtered_graph.degree[node]  # type: ignore
-            if bound_lower is not None and degree < bound_lower:
-                filtered_graph.remove_node(node)
-            if bound_upper is not None and degree > bound_upper:
-                filtered_graph.remove_node(node)
-
-        if filtered_graph._undirected is not None:
-            filtered_graph.to_undirected(inplace=True, logging=False)
-
-        filtered_graph.update_metadata(logging=False)
-
-        return filtered_graph
-
     def _save_prepare(
         self,
         path: Path,
@@ -379,14 +384,13 @@ class TokenGraph(DiGraph):
 
         return saving_path
 
-    def save_graph(
+    def to_GraphML(
         self,
         path: Path,
         filename: str | None = None,
         directed: bool = False,
     ) -> None:
-        """save one of the stored graphs to disk file,
-        currently only GraphML format is supported
+        """save one of the stored graphs to GraphML format on disk
 
         Parameters
         ----------
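For orientation, the two new module-level filters behave like the following minimal, runnable sketch on a plain networkx.DiGraph (TokenGraph subclasses a DiGraph with the networkx interface); the toy node names and bounds are invented, and the real functions additionally refresh the cached undirected view and the graph metadata:

    import networkx as nx

    g = nx.DiGraph()
    g.add_edge('pump', 'leak', weight=200)
    g.add_edge('pump', 'noise', weight=40)
    g.add_edge('valve', 'leak', weight=150)

    # mirrors filter_graph_by_edge_weight: drop edges below the lower bound (bounds inclusive)
    for u, v in list(g.edges):
        if g[u][v]['weight'] < 150:
            g.remove_edge(u, v)

    # mirrors filter_graph_by_node_degree: drop nodes with degree below 1
    for node in list(g.nodes):
        if g.degree[node] < 1:
            g.remove_node(node)

    print(sorted(g.edges(data='weight')))
    # [('pump', 'leak', 200), ('valve', 'leak', 150)]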
@@ -22,7 +22,7 @@ from lang_main.analysis.shared import (
     similar_index_groups,
 )
 from lang_main.loggers import logger_preprocess as logger
-from lang_main.pipelines.base import BasePipeline
+from lang_main.pipelines.base import Pipeline
 from lang_main.types import Embedding, PandasIndex
 
 # ** RE patterns
@@ -119,10 +119,9 @@ def remove_duplicates(
     ).copy()
     logger.info('Removed all duplicates from dataset successfully.')
     logger.info(
-        (
-            f'New Dataset properties: number of entries: {len(wo_duplicates)}, '
-            f'number of features {len(wo_duplicates.columns)}'
-        )
+        'New Dataset properties: number of entries: %d, number of features %d',
+        len(wo_duplicates),
+        len(wo_duplicates.columns),
     )
 
     return (wo_duplicates,)
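The logger change above swaps an eager f-string for logging's deferred %-style formatting: an f-string is built even when the INFO level is disabled, while positional arguments are only formatted if the record is actually emitted. A quick comparison:

    import logging

    logger = logging.getLogger('demo')
    n_rows, n_cols = 59_499, 12

    logger.info(f'entries: {n_rows}, features: {n_cols}')      # string built unconditionally
    logger.info('entries: %d, features: %d', n_rows, n_cols)   # formatted only when emitted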
@@ -176,6 +175,7 @@ def clean_string_slim(string: str) -> str:
     string = pattern_special_chars.sub(' ', string)
     string = pattern_repeated_chars.sub(r'\1', string)
     # string = pattern_dates.sub('', string)
+    # dates are used for context, should not be removed at this stage
     string = pattern_whitespace.sub(' ', string)
     # remove whitespaces at the beginning and the end
     string = string.strip()
@@ -241,11 +241,84 @@ def analyse_feature(
     return (result_df,)
 
 
+# ** pre-filter
+def numeric_pre_filter_feature(
+    data: DataFrame,
+    feature: str,
+    bound_lower: int | None,
+    bound_upper: int | None,
+) -> tuple[DataFrame]:
+    if not any([bound_lower, bound_upper]):
+        raise ValueError('No bounds for filtering provided')
+
+    data = data.copy()
+    if bound_lower is None:
+        bound_lower = cast(int, data[feature].min())
+    if bound_upper is None:
+        bound_upper = cast(int, data[feature].max())
+
+    filter_lower = data[feature] >= bound_lower
+    filter_upper = data[feature] <= bound_upper
+    filter = filter_lower & filter_upper
+
+    data = data.loc[filter]
+
+    return (data,)
+
+
 # ** embedding based similarity
 # following functions used to identify similar entries to have
 # a more robust identification of duplicates negating negative side effects
 # of several disturbances like typos, escape characters, etc.
 # build mapping of embeddings for given model
+def merge_similarity_dupl(
+    data: DataFrame,
+    model: SentenceTransformer,
+    cos_sim_threshold: float,
+) -> tuple[DataFrame]:
+    logger.info('Start merging of similarity candidates...')
+
+    # data
+    merged_data = data.copy()
+    model_input = merged_data['entry']
+    candidates_idx = candidates_by_index(
+        data_model_input=model_input,
+        model=model,
+        cos_sim_threshold=cos_sim_threshold,
+    )
+    # graph of similar ids
+    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
+
+    for similar_id_group in similar_index_groups(similar_id_graph):
+        similar_id_group = list(similar_id_group)
+        similar_data = merged_data.loc[similar_id_group, :]
+        # keep first entry with max number occurrences, then number of
+        # associated objects, then length of entry
+        similar_data = similar_data.sort_values(
+            by=['num_occur', 'num_assoc_obj_ids', 'len'],
+            ascending=[False, False, False],
+        )
+        # merge information to first entry
+        data_idx = cast(PandasIndex, similar_data.index[0])
+        similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
+        assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
+        assoc_obj_ids = np.concatenate(assoc_obj_ids)
+        assoc_obj_ids = np.unique(assoc_obj_ids)
+        similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
+        similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
+        # remaining indices, should be removed
+        similar_id_group.remove(data_idx)
+        merged_similar_data = similar_data.drop(index=similar_id_group)
+        # update entry in main dataset, drop remaining entries
+        merged_data.update(merged_similar_data)
+        merged_data = merged_data.drop(index=similar_id_group)
+
+    logger.info('Similarity candidates merged successfully.')
+
+    return (merged_data,)
+
+
+#####################################################################
 def build_embedding_map(
     data: Series,
     model: GermanSpacyModel | SentenceTransformer,
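The pre-filter keeps rows whose numeric feature falls inside the (inclusive) bounds, with a missing bound defaulting to the column's min or max. A toy illustration of the same semantics using pandas only (column names and values are made up):

    import pandas as pd

    df = pd.DataFrame({'entry': ['ok', 'a', 'longer text'], 'len': [2, 1, 11]})
    bound_lower, bound_upper = 2, None

    lo = bound_lower if bound_lower is not None else df['len'].min()
    hi = bound_upper if bound_upper is not None else df['len'].max()
    filtered = df.loc[(df['len'] >= lo) & (df['len'] <= hi)]
    print(filtered['entry'].tolist())  # ['ok', 'longer text']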
@@ -373,7 +446,7 @@ def list_cosSim_dupl_candidates(
     save_candidates: bool = False,
     saving_path: Path | None = None,
     filename: str = 'CosSim-FilterCandidates',
-    pipeline: BasePipeline | None = None,
+    pipeline: Pipeline | None = None,
 ) -> tuple[list[tuple[PandasIndex, PandasIndex]], dict[int, tuple[Embedding, str]]]:
     """providing an overview of candidates with a similarity score greater than
     given threshold; more suitable for debugging purposes
@@ -465,53 +538,6 @@ def similar_ids_groups(
     """
 
 
-def merge_similarity_dupl(
-    data: DataFrame,
-    model: SentenceTransformer,
-    cos_sim_threshold: float,
-) -> tuple[DataFrame]:
-    logger.info('Start merging of similarity candidates...')
-
-    # data
-    merged_data = data.copy()
-    model_input = merged_data['entry']
-    candidates_idx = candidates_by_index(
-        data_model_input=model_input,
-        model=model,
-        cos_sim_threshold=cos_sim_threshold,
-    )
-    # graph of similar ids
-    similar_id_graph, _ = similar_index_connection_graph(candidates_idx)
-
-    for similar_id_group in similar_index_groups(similar_id_graph):
-        similar_id_group = list(similar_id_group)
-        similar_data = merged_data.loc[similar_id_group, :]
-        # keep first entry with max number occurrences, then number of
-        # associated objects, then length of entry
-        similar_data = similar_data.sort_values(
-            by=['num_occur', 'num_assoc_obj_ids', 'len'],
-            ascending=[False, False, False],
-        )
-        # merge information to first entry
-        data_idx = cast(PandasIndex, similar_data.index[0])
-        similar_data.at[data_idx, 'num_occur'] = similar_data['num_occur'].sum()
-        assoc_obj_ids = similar_data['assoc_obj_ids'].to_numpy()
-        assoc_obj_ids = np.concatenate(assoc_obj_ids)
-        assoc_obj_ids = np.unique(assoc_obj_ids)
-        similar_data.at[data_idx, 'assoc_obj_ids'] = assoc_obj_ids
-        similar_data.at[data_idx, 'num_assoc_obj_ids'] = len(assoc_obj_ids)
-        # remaining indices, should be removed
-        similar_id_group.remove(data_idx)
-        merged_similar_data = similar_data.drop(index=similar_id_group)
-        # update entry in main dataset, drop remaining entries
-        merged_data.update(merged_similar_data)
-        merged_data = merged_data.drop(index=similar_id_group)
-
-    logger.info('Similarity candidates merged successfully.')
-
-    return (merged_data.copy(),)
-
-
 # merge duplicates
 def merge_similarity_dupl_old(
     data: DataFrame,
@@ -24,13 +24,13 @@ PATH_TO_DATASET: Final[Path] = path_dataset_conf.resolve()
 # if not PATH_TO_DATASET.exists():
 #     raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.')
 # ** control
-DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
+# DO_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing']
 SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip']
-DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
+# DO_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis']
 SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip']
-DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
+# DO_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing']
 SKIP_GRAPH_POSTPROCESSING: Final[bool] = CONFIG['control']['graph_postprocessing_skip']
-DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
+# DO_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis']
 SKIP_TIME_ANALYSIS: Final[bool] = CONFIG['control']['time_analysis_skip']
 
 # ** models
@@ -66,11 +66,11 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
 ]
 FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
 # ** time_analysis.model_input
-MODEL_INPUT_FEATURES: Final[tuple[str]] = tuple(
+MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
     CONFIG['time_analysis']['model_input']['input_features']
 )
 ACTIVITY_FEATURE: Final[str] = CONFIG['time_analysis']['model_input']['activity_feature']
-ACTIVITY_TYPES: Final[tuple[str]] = tuple(
+ACTIVITY_TYPES: Final[tuple[str, ...]] = tuple(
     CONFIG['time_analysis']['model_input']['activity_types']
 )
 THRESHOLD_NUM_ACTIVITIES: Final[int] = CONFIG['time_analysis']['model_input'][
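The annotation fix matters: tuple[str] types a tuple of exactly one string, while tuple[str, ...] is the variable-length form that actually matches a tuple(...) built from a config list:

    from typing import Final

    FEATURES: Final[tuple[str, ...]] = tuple(['VorgangsTypName', 'VorgangsBeschreibung'])
    # Final[tuple[str]] would only type-check for a 1-tuple such as ('VorgangsTypName',)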
@@ -1,4 +1,3 @@
-import os
 import pickle
 import shutil
 import tomllib
@@ -22,7 +21,7 @@ def create_saving_folder(
     if overwrite_existing:
         # overwrite if desired (deletes whole path and re-creates it)
         shutil.rmtree(saving_path_folder)
-        os.makedirs(saving_path_folder)
+        saving_path_folder.mkdir(parents=True)
     else:
         logger.info(
             (
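The pathlib call is a drop-in replacement for the removed os.makedirs (both create missing parent directories and raise FileExistsError if the target already exists), which is what lets the io module drop its os import; the path below is just an example:

    from pathlib import Path

    saving_path_folder = Path('./results/test_20240619')
    saving_path_folder.mkdir(parents=True)  # same effect as os.makedirs(saving_path_folder)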
@@ -62,56 +61,14 @@ def load_pickle(
     return obj
 
 
-# TODO: remove, too specialised for common application
-"""
-def filter_candidates_idx(
-    data_model_input: Series,
-    model: SentenceTransformer,
-    cos_sim_threshold: float,
-) -> Iterator[tuple[PandasIndex, PandasIndex]]:
-    common function to filter candidate indices based on cosine similarity
-    using SentenceTransformer model in batch mode,
-    feed of data as Series to retain information about indices of entries
-
-    Parameters
-    ----------
-    data_model_input : Series
-        containing indices and text entries to process
-    model : SentenceTransformer
-        necessary SentenceTransformer model to encode text entries
-    cos_sim_threshold : float
-        threshold for cosine similarity to filter candidates
-
-    Yields
-    ------
-    Iterator[tuple[PandasIndex, PandasIndex]]
-        index pairs which meet the cosine similarity threshold
-
-    # embeddings
-    batch = typing.cast(list[str], data_model_input.to_list())
-    embds = typing.cast(Tensor, model.encode(
-        batch,
-        convert_to_numpy=False,
-        convert_to_tensor=True,
-        show_progress_bar=False,
-    ))
-
-    # cosine similarity
-    cos_sim = typing.cast(
-        npt.NDArray,
-        sentence_transformers.util.cos_sim(embds, embds).numpy()
-    )
-    np.fill_diagonal(cos_sim, 0.)
-    cos_sim = np.triu(cos_sim)
-    cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
-
-    for idx_array in cos_sim_idx:
-        idx_pair = typing.cast(
-            tuple[np.int64, np.int64],
-            tuple(data_model_input.index[idx] for idx in idx_array)
-        )
-        yield idx_pair
-"""
+def get_entry_point(
+    saving_path: Path,
+    filename: str,
+) -> Path:
+    entry_point_path = (saving_path / filename).with_suffix('.pkl')
+    if not entry_point_path.exists():
+        raise FileNotFoundError(
+            f'Could not find provided entry data under path: >>{entry_point_path}<<'
+        )
+
+    return entry_point_path
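A hypothetical call of the new helper, assuming a previous pipeline run has pickled a result under the given name; get_entry_point only resolves and validates the path, loading stays with load_pickle:

    from pathlib import Path
    from lang_main.io import get_entry_point, load_pickle

    entry = get_entry_point(Path('./results'), 'TOKEN_ANALYSIS')  # -> ./results/TOKEN_ANALYSIS.pkl
    data = load_pickle(entry)  # get_entry_point already raised if the file was missing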
@@ -9,14 +9,12 @@ dataset = './01_2_Rohdaten_neu/Export4.csv'
 #results = './results/Export7_trunc/'
 #dataset = './01_03_Rohdaten_202403/Export7_trunc.csv'
 
+# only debugging features, production-ready pipelines should always
+# be fully executed
 [control]
-preprocessing = true
 preprocessing_skip = false
-token_analysis = false
 token_analysis_skip = false
-graph_postprocessing = false
 graph_postprocessing_skip = false
-time_analysis = false
 time_analysis_skip = false
 
 #[export_filenames]
@@ -42,9 +40,12 @@ criterion_feature = 'HObjektText'
 feature_name_obj_id = 'ObjektID'
 
 [time_analysis.model_input]
+# input_features = [
+#     'VorgangsTypName',
+#     'VorgangsArtText',
+#     'VorgangsBeschreibung',
+# ]
 input_features = [
-    'VorgangsTypName',
-    'VorgangsArtText',
     'VorgangsBeschreibung',
 ]
 activity_feature = 'VorgangsTypName'
@@ -1,9 +1,14 @@
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
 from collections.abc import Callable
 from pathlib import Path
-from typing import Any
+from typing import Any, Never, cast
+from typing_extensions import override
 
-from lang_main.loggers import logger_pipelines as logger
 from lang_main.io import load_pickle, save_pickle
+from lang_main.loggers import logger_pipelines as logger
+from lang_main.types import ResultHandling
 
 # ** pipelines to perform given actions on dataset in a customisable manner
@@ -12,7 +17,18 @@ class NoPerformableActionError(Exception):
     """Error describing that no action is available in the current pipeline"""
 
 
-class BasePipeline:
+class WrongActionTypeError(Exception):
+    """Error raised if an added action's type is not supported by the corresponding pipeline"""
+
+
+class OutputInPipelineContainerError(Exception):
+    """Error raised if an output was returned by one of the actions performed
+    in a PipelineContainer. Each action in a PipelineContainer is itself a
+    procedure without parameters or return values and should therefore not
+    return anything."""
+
+
+class BasePipeline(ABC):
     def __init__(
         self,
         name: str,
@@ -25,18 +41,12 @@ class BasePipeline:
         self.name = name
         # working directory for pipeline == output path
         self.working_dir = working_dir
-        # if not self.working_dir.exists():
-        #     self.working_dir.mkdir(parents=True)
-
         # container for actions to perform during pass
         self.actions: list[Callable] = []
         self.action_names: list[str] = []
-        self.actions_kwargs: list[dict[str, Any]] = []
-        self.is_save_result: list[bool] = []
         # progress tracking, start at 1
         self.curr_proc_idx: int = 1
-        # intermediate result
-        self._intermediate_result: Any | None = None
 
     def __repr__(self) -> str:
         return (
@@ -44,15 +54,132 @@ class BasePipeline:
             f'working dir: {self.working_dir}, contents: {self.action_names})'
         )
 
-    @property
-    def intermediate_result(self) -> Any:
-        return self._intermediate_result
+    def panic_wrong_action_type(
+        self,
+        action: Any,
+        compatible_type: str,
+    ) -> Never:
+        raise WrongActionTypeError(
+            (
+                f'Action must be of type {compatible_type}, '
+                f'but is of type >>{type(action)}<<.'
+            )
+        )
+
+    def prep_run(self) -> None:
+        logger.info('Starting pipeline >>%s<<...', self.name)
+        # progress tracking
+        self.curr_proc_idx = 1
+        # check if performable actions available
+        if len(self.actions) == 0:
+            raise NoPerformableActionError(
+                'The pipeline does not contain any performable actions.'
+            )
+
+    def post_run(self) -> None:
+        logger.info(
+            'Processing pipeline >>%s<< successfully ended after %d steps.',
+            self.name,
+            (self.curr_proc_idx - 1),
+        )
+
+    @abstractmethod
+    def add(self) -> None: ...
+
+    @abstractmethod
+    def logic(self) -> None: ...
+
+    def run(self, *args, **kwargs) -> Any:
+        self.prep_run()
+        ret = self.logic(*args, **kwargs)
+        self.post_run()
+        return ret
+
+
+class PipelineContainer(BasePipeline):
+    def __init__(
+        self,
+        name: str,
+        working_dir: Path,
+    ) -> None:
+        super().__init__(name=name, working_dir=working_dir)
+
+        self.action_skip: list[bool] = []
+
+    @override
+    def add(
+        self,
+        action: Callable,
+        skip: bool = False,
+    ) -> None:
+        if isinstance(action, Callable):
+            self.actions.append(action)
+            self.action_names.append(action.__name__)
+            self.action_skip.append(skip)
+        else:
+            self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
+
+    @override
+    def logic(self) -> None:
+        for idx, (action, action_name) in enumerate(zip(self.actions, self.action_names)):
+            # loading
+            if self.action_skip[idx]:
+                logger.info('[No Calculation] Skipping >>%s<<...', action_name)
+                self.curr_proc_idx += 1
+                continue
+            # calculation
+            ret = action()
+            if ret is not None:
+                raise OutputInPipelineContainerError(
+                    (
+                        f'Output in PipelineContainers not allowed. Action {action_name} '
+                        f'returned values in Container {self.name}.'
+                    )
+                )
+            # processing tracking
+            self.curr_proc_idx += 1
+
+
+class Pipeline(BasePipeline):
+    def __init__(
+        self,
+        name: str,
+        working_dir: Path,
+    ) -> None:
+        # init base class
+        super().__init__(name=name, working_dir=working_dir)
+
+        # name of pipeline
+        self.name = name
+        # working directory for pipeline == output path
+        self.working_dir = working_dir
+        # if not self.working_dir.exists():
+        #     self.working_dir.mkdir(parents=True)
+
+        # container for actions to perform during pass
+        self.actions_kwargs: list[dict[str, Any]] = []
+        self.save_results: ResultHandling = []
+        self.load_results: ResultHandling = []
+        # intermediate result
+        self._intermediate_result: tuple[Any, ...] | None = None
+
+    def __repr__(self) -> str:
+        return (
+            f'{self.__class__.__name__}(name: {self.name}, '
+            f'working dir: {self.working_dir}, contents: {self.action_names})'
+        )
+
+    # @property
+    # def intermediate_result(self) -> tuple[Any, ...] | None:
+    #     return self._intermediate_result
+
+    @override
     def add(
         self,
         action: Callable,
         action_kwargs: dict[str, Any] = {},
         save_result: bool = False,
+        load_result: bool = False,
+        filename: str | None = None,
     ) -> None:
         # check explicitly for function type
         # if isinstance(action, FunctionType):
@@ -60,11 +187,10 @@ class BasePipeline:
             self.actions.append(action)
             self.action_names.append(action.__name__)
             self.actions_kwargs.append(action_kwargs.copy())
-            self.is_save_result.append(save_result)
+            self.save_results.append((save_result, filename))
+            self.load_results.append((load_result, filename))
         else:
-            raise TypeError(
-                f'Action must be custom function, but is of type >>{type(action)}<<.'
-            )
+            self.panic_wrong_action_type(action=action, compatible_type=Callable.__name__)
 
     # TODO: add multiple entries by utilising simple add method
     """
@@ -88,57 +214,84 @@ class BasePipeline:
                 f"but is of type >>{type(action)}<<."))
     """
 
-    def save_curr_result(
+    def get_result_path(
         self,
-        filename: str,
+        action_idx: int,
+        filename: str | None,
+    ) -> tuple[Path, str]:
+        action_name = self.action_names[action_idx]
+        if filename is None:
+            target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_{action_name}'
+        else:
+            target_filename = filename
+        target_path = self.working_dir.joinpath(target_filename).with_suffix('.pkl')
+        return target_path, action_name
+
+    def load_step(
+        self,
+        action_idx: int,
+        filename: str | None,
+    ) -> tuple[Any, ...]:
+        target_path, action_name = self.get_result_path(action_idx, filename)
+
+        if not target_path.exists():
+            raise FileNotFoundError(
+                (
+                    f'No intermediate results for action >>{action_name}<< '
+                    f'under >>{target_path}<< found'
+                )
+            )
+        # results should be tuple, but that is not guaranteed
+        result_loaded = cast(tuple[Any, ...], load_pickle(target_path))
+        if not isinstance(result_loaded, tuple):
+            raise TypeError(f'Loaded results must be tuple, not {type(result_loaded)}')
+
+        return result_loaded
+
+    def save_step(
+        self,
+        action_idx: int,
+        filename: str | None,
     ) -> None:
-        target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
-        target_path = self.working_dir.joinpath(target_filename)
-        target_path = target_path.with_suffix('.pkl')
+        # target_filename = f'Pipe-{self.name}_Step-{self.curr_proc_idx}_' + filename
+        # target_path = self.working_dir.joinpath(target_filename)
+        # target_path = target_path.with_suffix('.pkl')
+        target_path, _ = self.get_result_path(action_idx, filename)
         # saving file locally
         save_pickle(obj=self._intermediate_result, path=target_path)
 
-    def load_intermediate_result(
-        self,
-        saving_path: str,
-        filename: str,
-    ) -> tuple[Any, ...]:
-        target_path = Path(saving_path + filename).with_suffix('.pkl')
-        # loading DataFrame or Series from pickle
-        data = load_pickle(target_path)
-
-        return data
-
-    def prep_run(self) -> None:
-        logger.info('Starting processing pipeline >>%s<<...', self.name)
-        # progress tracking
-        self.curr_proc_idx = 1
-        # check if performable actions available
-        if len(self.actions) == 0:
-            raise NoPerformableActionError(
-                'The pipeline does not contain any performable actions.'
-            )
-
-    def run(
+    @override
+    def logic(
         self,
         starting_values: tuple[Any, ...],
     ) -> tuple[Any, ...]:
-        # prepare start
-        self.prep_run()
-
         for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)):
+            # loading
+            if self.load_results[idx][0]:
+                filename = self.load_results[idx][1]
+                ret = self.load_step(action_idx=idx, filename=filename)
+                logger.info(
+                    '[No Calculation] Loaded result for action >>%s<< successfully',
+                    self.action_names[idx],
+                )
+                self.curr_proc_idx += 1
+                continue
+            # calculation
             if idx == 0:
                 ret = action(*starting_values, **action_kwargs)
             else:
                 ret = action(*ret, **action_kwargs)
+
+            if not isinstance(ret, tuple):
+                ret = (ret,)
+            ret = cast(tuple[Any, ...], ret)
             # save intermediate result
             self._intermediate_result = ret
-            # check if result should be saved locally
-            if self.is_save_result[idx]:
-                self.save_curr_result(filename=self.action_names[idx])
+            # saving result locally, always save last action
+            if self.save_results[idx][0] or idx == (len(self.actions) - 1):
+                filename = self.save_results[idx][1]
+                self.save_step(action_idx=idx, filename=filename)
             # processing tracking
             self.curr_proc_idx += 1
 
-        logger.info('Processing pipeline >>%s<< successfully ended.', self.name)
-
         return ret
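A sketch of how the reworked classes compose, assuming the package context; the two actions and the 'demo_numbers' filename are invented for illustration. Actions consume and return tuples, save_result/load_result control pickled checkpoints in working_dir (the last action's result is always saved), and a load_result step skips the computation entirely:

    from pathlib import Path
    from lang_main.pipelines.base import Pipeline

    def make_numbers() -> tuple[list[int]]:
        return ([1, 2, 3],)

    def double(values: list[int]) -> tuple[list[int]]:
        return ([v * 2 for v in values],)

    pipe = Pipeline(name='Demo', working_dir=Path('./results'))  # assuming the directory exists
    pipe.add(make_numbers, save_result=True, filename='demo_numbers')
    pipe.add(double)
    (result,) = pipe.run(starting_values=())
    # a later run could replace the first add() with
    # pipe.add(make_numbers, load_result=True, filename='demo_numbers')
    # to unpickle ./results/demo_numbers.pkl instead of recomputing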
@@ -1,9 +1,11 @@
+from lang_main.analysis import graphs
 from lang_main.analysis.preprocessing import (
     analyse_feature,
     clean_string_slim,
     entry_wise_cleansing,
     load_raw_data,
     merge_similarity_dupl,
+    numeric_pre_filter_feature,
     remove_duplicates,
     remove_NA,
 )
@@ -23,40 +25,50 @@ from lang_main.constants import (
     SAVE_PATH_FOLDER,
     SPCY_MODEL,
     STFR_MODEL,
+    THRESHOLD_AMOUNT_CHARACTERS,
+    THRESHOLD_EDGE_WEIGHT,
     THRESHOLD_NUM_ACTIVITIES,
     THRESHOLD_SIMILARITY,
     THRESHOLD_TIMELINE_SIMILARITY,
     THRESHOLD_UNIQUE_TEXTS,
     UNIQUE_CRITERION_FEATURE,
 )
-from lang_main.pipelines.base import BasePipeline
+from lang_main.pipelines.base import Pipeline
+from lang_main.types import EntryPoints
 
 
 # ** pipeline configuration
 # ** target feature preparation
-pipe_target_feat = BasePipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
-pipe_target_feat.add(
-    load_raw_data,
-    {
-        'date_cols': DATE_COLS,
-    },
-)
-pipe_target_feat.add(remove_duplicates)
-pipe_target_feat.add(remove_NA, save_result=True)
-pipe_target_feat.add(
-    entry_wise_cleansing,
-    {
-        'target_feature': 'VorgangsBeschreibung',
-        'cleansing_func': clean_string_slim,
-    },
-    save_result=True,
-)
-pipe_target_feat.add(
-    analyse_feature,
-    {
-        'target_feature': 'VorgangsBeschreibung',
-    },
-    save_result=True,
-)
+def build_base_target_feature_pipe() -> Pipeline:
+    pipe_target_feat = Pipeline(name='TargetFeature', working_dir=SAVE_PATH_FOLDER)
+    pipe_target_feat.add(
+        load_raw_data,
+        {
+            'date_cols': DATE_COLS,
+        },
+    )
+    pipe_target_feat.add(remove_duplicates)
+    pipe_target_feat.add(remove_NA, save_result=True)
+    pipe_target_feat.add(
+        entry_wise_cleansing,
+        {
+            'target_feature': 'VorgangsBeschreibung',
+            'cleansing_func': clean_string_slim,
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE,
+    )
+    pipe_target_feat.add(
+        analyse_feature,
+        {
+            'target_feature': 'VorgangsBeschreibung',
+        },
+        save_result=True,
+    )
+
+    return pipe_target_feat
 
 
 # output: DataFrame containing target feature with
 # number of occurrences and associated ObjectIDs
@@ -81,37 +93,80 @@ pipe_target_feat.add(
 #     save_result=True,
 # )
 
 
 # ** Merge duplicates
-pipe_merge = BasePipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
-# pipe_merge.add(merge_similarity_dupl, save_result=True)
-pipe_merge.add(
-    merge_similarity_dupl,
-    {
-        'model': STFR_MODEL,
-        'cos_sim_threshold': THRESHOLD_SIMILARITY,
-    },
-    save_result=True,
-)
+def build_merge_duplicates_pipe() -> Pipeline:
+    pipe_merge = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
+    # pipe_merge.add(merge_similarity_dupl, save_result=True)
+    pipe_merge.add(
+        numeric_pre_filter_feature,
+        {
+            'feature': 'len',
+            'bound_lower': THRESHOLD_AMOUNT_CHARACTERS,
+            'bound_upper': None,
+        },
+    )
+    pipe_merge.add(
+        merge_similarity_dupl,
+        {
+            'model': STFR_MODEL,
+            'cos_sim_threshold': THRESHOLD_SIMILARITY,
+        },
+        save_result=True,
+        filename=EntryPoints.TOKEN_ANALYSIS,
+    )
+
+    return pipe_merge
 
 
 # ** token analysis
-pipe_token_analysis = BasePipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
-pipe_token_analysis.add(
-    build_token_graph,
-    {
-        'model': SPCY_MODEL,
-        'target_feature': 'entry',
-        'weights_feature': 'num_occur',
-        'batch_idx_feature': 'batched_idxs',
-        'build_map': True,
-        'batch_size_model': 50,
-    },
-    save_result=True,
-)
+def build_tk_graph_pipe() -> Pipeline:
+    pipe_token_analysis = Pipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_token_analysis.add(
+        build_token_graph,
+        {
+            'model': SPCY_MODEL,
+            'target_feature': 'entry',
+            'weights_feature': 'num_occur',
+            'batch_idx_feature': 'batched_idxs',
+            'build_map': False,
+            'batch_size_model': 50,
+        },
+        save_result=True,
+        filename=EntryPoints.TK_GRAPH_POST,
+    )
+
+    return pipe_token_analysis
+
+
+def build_tk_graph_post_pipe() -> Pipeline:
+    pipe_graph_postprocessing = Pipeline(
+        name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
+    )
+    pipe_graph_postprocessing.add(
+        graphs.filter_graph_by_edge_weight,
+        {
+            'bound_lower': THRESHOLD_EDGE_WEIGHT,
+            'bound_upper': None,
+        },
+    )
+    pipe_graph_postprocessing.add(
+        graphs.filter_graph_by_node_degree,
+        {
+            'bound_lower': 1,
+            'bound_upper': None,
+        },
+        save_result=True,
+        filename=EntryPoints.TK_GRAPH_ANALYSIS,
+    )
+
+    return pipe_graph_postprocessing
 
 
 # ** timeline analysis
-pipe_timeline = BasePipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
-pipe_timeline.add(
-    remove_non_relevant_obj_ids,
-    {
-        'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
+def build_timeline_pipe() -> Pipeline:
+    pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_timeline.add(
+        remove_non_relevant_obj_ids,
+        {
+            'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
@@ -119,15 +174,15 @@ pipe_timeline.add(
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-    },
-    save_result=True,
-)
-pipe_timeline.add(
-    generate_model_input,
-    {
-        'target_feature_name': 'nlp_model_input',
-        'model_input_features': MODEL_INPUT_FEATURES,
-    },
-)
-pipe_timeline.add(
-    filter_activities_per_obj_id,
-    {
-        'activity_feature': ACTIVITY_FEATURE,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+        },
+        save_result=True,
+    )
+    pipe_timeline.add(
+        generate_model_input,
+        {
+            'target_feature_name': 'nlp_model_input',
+            'model_input_features': MODEL_INPUT_FEATURES,
+        },
+    )
+    pipe_timeline.add(
+        filter_activities_per_obj_id,
+        {
+            'activity_feature': ACTIVITY_FEATURE,
@@ -135,8 +190,8 @@ pipe_timeline.add(
-        'feature_obj_id': FEATURE_NAME_OBJ_ID,
-        'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
-    },
-)
-pipe_timeline.add(
-    get_timeline_candidates,
-    {
-        'model': STFR_MODEL,
+            'feature_obj_id': FEATURE_NAME_OBJ_ID,
+            'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
+        },
+    )
+    pipe_timeline.add(
+        get_timeline_candidates,
+        {
+            'model': STFR_MODEL,
@@ -145,4 +200,7 @@ pipe_timeline.add(
-        'model_input_feature': 'nlp_model_input',
-    },
-    save_result=True,
-)
+            'model_input_feature': 'nlp_model_input',
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE_POST,
+    )
+
+    return pipe_timeline
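One plausible wiring of these builders into the new PipelineContainer, where the SKIP_* flags from the config decide which stages run; the wrapper functions below are illustrative, and the container itself only accepts zero-argument procedures that return None:

    from lang_main.constants import (
        PATH_TO_DATASET,
        SAVE_PATH_FOLDER,
        SKIP_GRAPH_POSTPROCESSING,
        SKIP_PREPROCESSING,
    )
    from lang_main.io import get_entry_point, load_pickle
    from lang_main.pipelines.base import PipelineContainer
    from lang_main.types import EntryPoints

    def run_preprocessing() -> None:
        pipe = build_base_target_feature_pipe()
        pipe.run(starting_values=(PATH_TO_DATASET,))  # return value discarded on purpose

    def run_graph_postprocessing() -> None:
        # pipeline results are pickled as tuples, hence the unpacking
        (graph,) = load_pickle(get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TK_GRAPH_POST))
        build_tk_graph_post_pipe().run(starting_values=(graph,))

    container = PipelineContainer(name='lang_main', working_dir=SAVE_PATH_FOLDER)
    container.add(run_preprocessing, skip=SKIP_PREPROCESSING)
    container.add(run_graph_postprocessing, skip=SKIP_GRAPH_POSTPROCESSING)
    container.run()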
@@ -17,8 +17,20 @@ class LoggingLevels(enum.IntEnum):
 
 # ** devices
 class STFRDeviceTypes(enum.StrEnum):
-    CPU = 'cpu'
-    GPU = 'cuda'
+    CPU = enum.auto()
+    GPU = enum.auto()
 
 
+# ** pipelines
+ResultHandling: TypeAlias = list[tuple[bool, str | None]]
+
+
+class EntryPoints(enum.StrEnum):
+    TIMELINE = 'TIMELINE'
+    TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
+    TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
+    TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
+    TOKEN_ANALYSIS = 'TOKEN_ANALYSIS'
+
+
 # ** datasets
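Note on the enum change: in a StrEnum, enum.auto() resolves to the lower-cased member name, so CPU keeps the value 'cpu' but GPU now becomes 'gpu' instead of the previous explicit 'cuda'. If that value is ever handed to torch as a device string, this is a behavioural change worth double-checking:

    import enum

    class STFRDeviceTypes(enum.StrEnum):
        CPU = enum.auto()  # 'cpu'
        GPU = enum.auto()  # 'gpu', previously 'cuda'

    assert STFRDeviceTypes.CPU == 'cpu' and STFRDeviceTypes.GPU == 'gpu'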
test-notebooks/misc.ipynb (new file, 1687 lines): file diff suppressed because it is too large