commit 3f58a14852
parent 9197146d2c

    sandboxing
@@ -1,9 +1,11 @@
 import time
 import webbrowser
-from pathlib import Path
+from collections.abc import Collection, Iterable
 from threading import Thread
 from typing import Any, Final, cast
 
+import pandas as pd
+
 # import dash_cytoscape as cyto
 import plotly.express as px
 from dash import (
@@ -21,20 +23,37 @@ from plotly.graph_objects import Figure
 
 import lang_main.io
 from lang_main.analysis import graphs, tokens
-from lang_main.constants import SAVE_PATH_FOLDER, SPCY_MODEL
+from lang_main.analysis.timeline import (
+    calc_delta_to_next_failure,
+    filter_timeline_cands,
+)
+from lang_main.constants import (
+    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+    NAME_DELTA_FEAT_TO_REPAIR,
+    SAVE_PATH_FOLDER,
+    SPCY_MODEL,
+)
 from lang_main.errors import EmptyEdgesError, EmptyGraphError
 from lang_main.pipelines.predefined import (
     build_tk_graph_render_pipe,
     build_tk_graph_rescaling_pipe,
 )
-from lang_main.types import EntryPoints, ObjectID, TimelineCandidates
+from lang_main.types import (
+    DataFrameTLFiltered,
+    EntryPoints,
+    HTMLColumns,
+    HTMLTable,
+    ObjectID,
+    TimelineCandidates,
+)
 
 # ** data
 # p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
-p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
+p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
 (data,) = cast(tuple[DataFrame], lang_main.io.load_pickle(p_df))
+# data = cleanup_descriptions(data, properties=['ErledigungsBeschreibung'])
 # p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
-p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
+p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_CANDS)
 cands, texts = cast(
     tuple[TimelineCandidates, dict[ObjectID, str]], lang_main.io.load_pickle(p_tl)
 )
@@ -56,17 +75,27 @@ PTH_RENDERED_GRAPH = lang_main.io.get_entry_point(
     file_ext='.svg',
 )
 
-TABLE_FEATS: Final[list[str]] = [
+# NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
+TABLE_FEATS_OVERVIEW: Final[list[str]] = [
     'ErstellungsDatum',
     'ErledigungsDatum',
+    NAME_DELTA_FEAT_TO_REPAIR,
     'VorgangsTypName',
     'VorgangsBeschreibung',
+    'ErledigungsBeschreibung',
 ]
 TABLE_FEATS_DATES: Final[list[str]] = [
     'ErstellungsDatum',
     'ErledigungsDatum',
 ]
+TABLE_FEATS_BEST_ACTIONS: Final[list[str]] = [
+    'ErstellungsDatum',
+    'ErledigungsDatum',
+    'VorgangsTypName',
+    'VorgangsBeschreibung',
+    'ErledigungsBeschreibung',
+    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+]
 
 # ** figure config
 MARKERS_OCCURRENCES: Final[dict[str, Any]] = {
@@ -86,13 +115,15 @@ HOVER_DATA: Final[dict[str, Any]] = {
     'ErstellungsDatum': '|%d.%m.%Y',
     'ErledigungsDatum': '|%d.%m.%Y',
     'VorgangsBeschreibung': True,
+    'ErledigungsBeschreibung': True,
 }
 HOVER_DATA_DELTA: Final[dict[str, Any]] = {
     'ErstellungsDatum': '|%d.%m.%Y',
     'ErledigungsDatum': '|%d.%m.%Y',
     'VorgangsDatum': '|%d.%m.%Y',
-    'delta': True,
+    NAME_DELTA_FEAT_TO_REPAIR: True,
     'VorgangsBeschreibung': True,
+    'ErledigungsBeschreibung': True,
 }
 
 # ** graph
@@ -136,10 +167,10 @@ graph_layout = html.Div(
         html.Img(
             id='static-graph-img',
             alt='static rendered graph',
-            # style={
-            #     'width': 'auto',
-            #     'height': 'auto',
-            # },
+            style={
+                'width': 'auto',
+                'height': 'auto',
+            },
         ),
         html.P(id='info-graph-errors', children=[]),
     ],
@@ -186,7 +217,27 @@ app.layout = html.Div(
             ]
         ),
         html.Div(
-            [dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
+            children=[
+                html.Div(
+                    [
+                        html.H5('Überblick ähnlicher Vorgänge'),
+                        dash_table.DataTable(id='table-candidates'),
+                    ],
+                    style={'paddingBottom': '1em'},
+                ),
+                html.Div(
+                    [
+                        html.H5(
+                            (
+                                'Maßnahmen sortiert nach längstem Zeitraum '
+                                'bis zum nächsten Ereignis'
+                            )
+                        ),
+                        dash_table.DataTable(id='table-best-actions'),
+                    ]
+                ),
+            ],
+            style={'marginBottom': '2em', 'padding': '2em'},
         ),
         graph_layout,
     ],
@@ -222,20 +273,21 @@ def update_choice_candidates(obj_id):
 
 
 # ** helpers to filter DataFrame
-def pre_filter_data(
+def filter_candidates(
     data: DataFrame,
     idx: int,
     obj_id: ObjectID,
-) -> DataFrame:
+) -> DataFrameTLFiltered:
+    # assert correct data type because of Dash
     idx = int(idx)
     obj_id = int(obj_id)
-    # data = data.copy()
-    cands_for_obj_id = cands[obj_id]
-    cands_choice = cands_for_obj_id[int(idx) - 1]
-    # data
-    data = data.loc[list(cands_choice)].sort_index()  # type: ignore
-    data['delta'] = data['ErledigungsDatum'] - data['ErstellungsDatum']
-    data['delta'] = data['delta'].dt.days
+    data = filter_timeline_cands(
+        data=data,
+        cands=cands,
+        obj_id=obj_id,
+        entry_idx=(idx - 1),  # idx in Dashboard starts with 1
+    )
 
     return data
 
@@ -258,10 +310,10 @@ def update_timeline(index, obj_id):
     obj_text = texts[obj_id]
     title_occurrences = f'HObjektText: {obj_text}'
     title_delta = f'HObjektText: {obj_text}, Differenz Erstellung und Erledigung'
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = filter_candidates(data, idx=index, obj_id=obj_id)
     # figure
     fig_occurrences = fig_timeline_occurrences(df, title_occurrences)
-    fig_delta = fig_timeline_delta(df, title_delta)
+    fig_delta = fig_timeline_delta(df, title_delta, delta_feature=NAME_DELTA_FEAT_TO_REPAIR)
 
     return fig_occurrences, fig_delta
 
@@ -293,11 +345,12 @@ def fig_timeline_occurrences(
 def fig_timeline_delta(
     df: DataFrame,
     title: str,
+    delta_feature: str,
 ) -> Figure:
     fig = px.scatter(
         data_frame=df,
         x='ErstellungsDatum',
-        y='delta',
+        y=delta_feature,
         title=title,
         hover_data=HOVER_DATA_DELTA,
     )
@@ -309,25 +362,77 @@ def fig_timeline_delta(
     return fig
 
 
+def transform_to_HTML_table(
+    data: DataFrame,
+    target_features: Collection[str],
+    date_cols: Iterable[str] | None = None,
+    sorting_feature: str | None = None,
+    sorting_ascending: bool = True,
+) -> tuple[HTMLColumns, HTMLTable]:
+    target_features = list(target_features)
+    data = data.copy()
+    data = data.filter(items=target_features, axis=1)
+
+    if sorting_feature is not None:
+        data = data.sort_values(by='ErstellungsDatum', ascending=sorting_ascending)
+
+    if date_cols is not None:
+        for col in date_cols:
+            data[col] = data[col].dt.strftime(r'%Y-%m-%d')
+
+    columns = [{'name': col, 'id': col} for col in data.columns]
+    table_data = data.to_dict('records')
+
+    return columns, table_data
+
+
+# 'table-best-actions'
 # ** HTML table
 @callback(
-    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
+    [
+        Output('table-candidates', 'columns'),
+        Output('table-candidates', 'data'),
+        Output('table-best-actions', 'columns'),
+        Output('table-best-actions', 'data'),
+    ],
     Input('selector-candidates', 'value'),
     State('selector-obj_id', 'value'),
     prevent_initial_call=True,
 )
-def update_table_candidates(index, obj_id):
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
-    df = df.filter(items=TABLE_FEATS, axis=1).sort_values(
-        by='ErstellungsDatum', ascending=True
+def update_tables_candidates(
+    index,
+    obj_id,
+) -> tuple[HTMLColumns, HTMLTable, HTMLColumns, HTMLTable]:
+    cands = filter_candidates(data, idx=index, obj_id=obj_id)
+    overview_cols, overview_table = transform_to_HTML_table(
+        data=cands,
+        target_features=TABLE_FEATS_OVERVIEW,
+        date_cols=TABLE_FEATS_DATES,
+        sorting_feature='ErstellungsDatum',
+        sorting_ascending=True,
     )
-    cols = [{'name': i, 'id': i} for i in df.columns]
-    # convert dates to strings
-    for col in TABLE_FEATS_DATES:
-        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
+    # df = df.filter(items=TABLE_FEATS_OVERVIEW, axis=1).sort_values(
+    #     by='ErstellungsDatum', ascending=True
+    # )
+    # cols = [{'name': i, 'id': i} for i in df.columns]
+    # # convert dates to strings
+    # for col in TABLE_FEATS_DATES:
+    #     df[col] = df[col].dt.strftime(r'%Y-%m-%d')
 
-    table_data = df.to_dict('records')
-    return table_data, cols
+    # table_data = df.to_dict('records')
+
+    cands_best_actions = calc_delta_to_next_failure(
+        data=cands,
+        date_feature='ErstellungsDatum',
+        name_delta_feature=NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+    )
+    best_actions_cols, best_actions_table = transform_to_HTML_table(
+        data=cands_best_actions,
+        target_features=TABLE_FEATS_BEST_ACTIONS,
+        date_cols=TABLE_FEATS_DATES,
+    )
+
+    return overview_cols, overview_table, best_actions_cols, best_actions_table
 
 
 # ** graph callbacks
@@ -345,7 +450,7 @@ def update_table_candidates(index, obj_id):
 def display_candidates_as_graph(index, obj_id):
     error_msg = ''
     t1 = time.perf_counter()
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = filter_candidates(data, idx=index, obj_id=obj_id)
     t2 = time.perf_counter()
     print(f'Time for filtering: {t2 - t1} s')
 
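The dashboard change above funnels all table output through the new transform_to_HTML_table helper. A self-contained sketch of the same DataFrame-to-DataTable conversion, with invented column values, shows the shape both tables expect:

```python
# Standalone sketch of the DataFrame -> DataTable conversion centralised in
# transform_to_HTML_table; column names and values are invented.
import pandas as pd

df = pd.DataFrame(
    {
        'ErstellungsDatum': pd.to_datetime(['2024-01-03', '2024-02-10']),
        'VorgangsBeschreibung': ['Pumpe defekt', 'Filter getauscht'],
    }
)

# dates have to become strings, otherwise the DataTable renders raw timestamps
df['ErstellungsDatum'] = df['ErstellungsDatum'].dt.strftime(r'%Y-%m-%d')

columns = [{'name': col, 'id': col} for col in df.columns]
table_data = df.to_dict('records')
# dash_table.DataTable(columns=columns, data=table_data) consumes both lists
print(columns)
print(table_data)
```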
@@ -1,9 +1,8 @@
-import re
-from collections.abc import Iterable
+from collections.abc import Collection
 from itertools import combinations
 from math import factorial
 from pathlib import Path
-from typing import Callable, cast
+from typing import cast
 
 import numpy as np
 import pandas as pd
@@ -25,11 +24,12 @@ from lang_main.loggers import logger_preprocess as logger
 from lang_main.pipelines.base import Pipeline
 from lang_main.types import Embedding, PandasIndex
 
-# ** RE patterns
-pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
-pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
-pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
-pattern_whitespace = re.compile(r'[ ]{2,}')
+# TODO removal
+# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
+# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
+# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
+# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
+# pattern_whitespace = re.compile(r'[ ]{2,}')
 
 
 # ** (1) dataset preparation: loading and simple preprocessing
@@ -37,7 +37,7 @@ pattern_whitespace = re.compile(r'[ ]{2,}')
 # duplicate cleansing based on all properties
 def load_raw_data(
     path: Path,
-    date_cols: Iterable[str] = (
+    date_cols: Collection[str] = (
         'VorgangsDatum',
         'ErledigungsDatum',
         'Arbeitsbeginn',
@@ -50,7 +50,7 @@ def load_raw_data(
     ----------
     path : str
         path to dataset file, usually CSV file
-    date_cols : list[str], optional
+    date_cols : Collection[str], optional
         columns which contain dates and are parsed as such,
         by default (
             'VorgangsDatum',
@@ -129,9 +129,7 @@ def remove_duplicates(
 
 def remove_NA(
     data: DataFrame,
-    target_features: list[str] = [
-        'VorgangsBeschreibung',
-    ],
+    target_features: Collection[str] = ('VorgangsBeschreibung',),
 ) -> tuple[DataFrame]:
     """function to drop NA entries based on a subset of features to be analysed
 
@@ -139,14 +137,15 @@ def remove_NA(
     ----------
     data : DataFrame
         standard IHM dataset, perhaps pre-cleaned
-    target_features : list[str], optional
-        subset to analyse to define an NA entry, by default [ 'VorgangsBeschreibung', ]
+    target_features : Collection[str], optional
+        subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)
 
     Returns
     -------
     DataFrame
         dataset with removed NA entries for given subset of features
     """
+    target_features = list(target_features)
     wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
     logger.info(
         f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
@@ -156,46 +155,7 @@ def remove_NA(
 
 
 # ** (2) entry-based cleansing
-# following functions clean and prepare specific entries, not whole dataset
-def clean_string_slim(string: str) -> str:
-    """mapping function to clean single string entries in a series (feature-wise)
-    of the dataset, used to be applied element-wise for string features
-
-    Parameters
-    ----------
-    string : str
-        dataset entry feature
-
-    Returns
-    -------
-    str
-        cleaned entry
-    """
-    # remove special chars
-    string = pattern_special_chars.sub(' ', string)
-    string = pattern_repeated_chars.sub(r'\1', string)
-    # string = pattern_dates.sub('', string)
-    # dates are used for context, should not be removed at this stage
-    string = pattern_whitespace.sub(' ', string)
-    # remove whitespaces at the beginning and the end
-    string = string.strip()
-
-    return string
-
-
-def entry_wise_cleansing(
-    data: DataFrame,
-    target_feature: str,
-    cleansing_func: Callable[[str], str],
-) -> tuple[DataFrame]:
-    # apply given cleansing function to target feature
-    data[target_feature] = data[target_feature].map(cleansing_func)
-    logger.info(
-        ('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
-        cleansing_func.__name__,
-        target_feature,
-    )
-    return (data,)
+# ** moved to module ``lang_main.analysis.shared``
 
 
 # ** in-depth analysis of one feature
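A likely side benefit of swapping the list defaults for tuples in load_raw_data and remove_NA, beyond the wider Collection annotation: tuple defaults cannot be mutated between calls. A toy illustration of the pitfall the old signatures allowed, with hypothetical helper names:

```python
# Toy illustration (not lang_main code): a mutable list default is created once
# and shared across calls, a tuple default cannot be modified in place.
def risky(features: list[str] = ['VorgangsBeschreibung']) -> list[str]:
    features.append('extra')  # mutates the shared default list
    return features


def safe(features: tuple[str, ...] = ('VorgangsBeschreibung',)) -> list[str]:
    return [*features, 'extra']  # default stays untouched


print(risky())  # ['VorgangsBeschreibung', 'extra']
print(risky())  # ['VorgangsBeschreibung', 'extra', 'extra']  <- the surprise
print(safe())   # ['VorgangsBeschreibung', 'extra']
print(safe())   # ['VorgangsBeschreibung', 'extra']
```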
@@ -1,4 +1,5 @@
-from collections.abc import Iterable, Iterator
+import re
+from collections.abc import Callable, Collection, Iterable, Iterator
 from typing import cast
 
 import networkx as nx
@@ -7,14 +8,70 @@ import numpy.typing as npt
 import sentence_transformers
 import sentence_transformers.util
 from networkx import Graph
-from pandas import Series
+from pandas import DataFrame, Series
 from sentence_transformers import SentenceTransformer
 from torch import Tensor
 
 from lang_main.analysis.graphs import get_graph_metadata, update_graph
+from lang_main.loggers import logger_preprocess as logger
 from lang_main.types import PandasIndex
 
+# ** RE patterns
+pattern_escape_newline = re.compile(r'[\n]+')
+pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
+pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
+pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
+pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
+pattern_whitespace = re.compile(r'[ ]{2,}')
+
 
+# ** RE applications
+# following functions clean and prepare specific entries, not whole datasets
+def clean_string_slim(string: str) -> str:
+    """mapping function to clean single string entries in a series (feature-wise)
+    of the dataset, used to be applied element-wise for string features
+
+    Parameters
+    ----------
+    string : str
+        dataset entry feature
+
+    Returns
+    -------
+    str
+        cleaned entry
+    """
+    # remove special chars
+    string = pattern_escape_newline.sub('. ', string)
+    string = pattern_escape_seq.sub(' ', string)
+    string = pattern_repeated_chars.sub('', string)
+    # string = pattern_dates.sub('', string)
+    # dates are used for context, should not be removed at this stage
+    string = pattern_whitespace.sub(' ', string)
+    # remove whitespaces at the beginning and the end
+    string = string.strip()
+
+    return string
+
+
+# ** dataset cleansing
+def entry_wise_cleansing(
+    data: DataFrame,
+    target_features: Collection[str],
+    cleansing_func: Callable[[str], str] = clean_string_slim,
+) -> tuple[DataFrame]:
+    # apply given cleansing function to target feature
+    target_features = list(target_features)
+    data[target_features] = data[target_features].map(cleansing_func)
+    logger.info(
+        ('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
+        cleansing_func.__name__,
+        target_features,
+    )
+    return (data,)
+
+
+# ** similarities
 def candidates_by_index(
     data_model_input: Series,
     model: SentenceTransformer,
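For a feel of what the relocated clean_string_slim now does with the new pattern set, here is a rough standalone walk-through on an invented maintenance entry; the comments show the expected intermediate results.

```python
# Rough standalone walk-through of the new cleaning order (invented entry).
import re

pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_whitespace = re.compile(r'[ ]{2,}')

raw = 'Pumpe defekt!!!\nLäuft  wieder'
s = pattern_escape_newline.sub('. ', raw)   # 'Pumpe defekt!!!. Läuft  wieder'
s = pattern_escape_seq.sub(' ', s)          # unchanged, no escape sequences left
s = pattern_repeated_chars.sub('', s)       # 'Pumpe defekt. Läuft  wieder'
s = pattern_whitespace.sub(' ', s).strip()  # 'Pumpe defekt. Läuft wieder'
print(s)
```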
@@ -1,4 +1,4 @@
-from collections.abc import Iterable, Iterator
+from collections.abc import Collection, Iterable, Iterator
 from typing import cast
 
 from pandas import DataFrame, Series
@@ -7,14 +7,21 @@ from tqdm.auto import tqdm # TODO: check deletion
 
 from lang_main.analysis.shared import (
     candidates_by_index,
+    entry_wise_cleansing,
+    pattern_escape_seq_sentences,
     similar_index_connection_graph,
     similar_index_groups,
 )
 from lang_main.loggers import logger_timeline as logger
-from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
+from lang_main.types import (
+    DataFrameTLFiltered,
+    ObjectID,
+    PandasIndex,
+    TimelineCandidates,
+)
 
 
-def non_relevant_obj_ids(
+def _non_relevant_obj_ids(
     data: DataFrame,
     thresh_unique_feat_per_id: int,
     *,
@@ -50,9 +57,9 @@ def remove_non_relevant_obj_ids(
     feature_uniqueness: str = 'HObjektText',
     feature_obj_id: str = 'ObjektID',
 ) -> tuple[DataFrame]:
-    logger.info('Removing non-relevant ObjectIDs from dataset')
+    logger.info('Removing non-relevant ObjectIDs from dataset...')
     data = data.copy()
-    ids_to_ignore = non_relevant_obj_ids(
+    ids_to_ignore = _non_relevant_obj_ids(
         data=data,
         thresh_unique_feat_per_id=thresh_unique_feat_per_id,
         feature_uniqueness=feature_uniqueness,
@@ -61,7 +68,43 @@ def remove_non_relevant_obj_ids(
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
     logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
-    logger.info('Non-relevant ObjectIDs removed successfully')
+    logger.info('Non-relevant ObjectIDs removed successfully.')
 
+    return (data,)
+
+
+def cleanup_descriptions(
+    data: DataFrame,
+    properties: Collection[str] = (
+        'VorgangsBeschreibung',
+        'ErledigungsBeschreibung',
+    ),
+) -> tuple[DataFrame]:
+    logger.info('Cleaning necessary descriptions...')
+    data = data.copy()
+    features = list(properties)
+    data[features] = data[features].fillna('N.V.')
+    (data,) = entry_wise_cleansing(data, target_features=features)
+    logger.info('Cleansing successful.')
+
+    return (data.copy(),)
+
+
+def calc_delta_to_repair(
+    data: DataFrame,
+    date_feature_start: str = 'ErstellungsDatum',
+    date_feature_end: str = 'ErledigungsDatum',
+    name_delta_feature: str = 'delta_to_repair',
+    convert_to_days: bool = True,
+) -> tuple[DataFrame]:
+    logger.info('Calculating time differences between start and end of operations...')
+    data = data.copy()
+    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]
+
+    if convert_to_days:
+        data[name_delta_feature] = data[name_delta_feature].dt.days
+
+    logger.info('Calculation successful.')
+
     return (data,)
 
@@ -75,7 +118,7 @@ def generate_model_input(
         'VorgangsBeschreibung',
     ),
 ) -> tuple[DataFrame]:
-    logger.info('Generating concatenation of model input features')
+    logger.info('Generating concatenation of model input features...')
     data = data.copy()
     model_input_features = list(model_input_features)
     input_features = data[model_input_features].fillna('').astype(str)
@@ -83,7 +126,7 @@ def generate_model_input(
         lambda x: ' - '.join(x),
         axis=1,
     )
-    logger.info('Model input generated successfully')
+    logger.info('Model input generated successfully.')
 
     return (data,)
 
@@ -97,7 +140,7 @@ def filter_activities_per_obj_id(
 ) -> tuple[DataFrame, Series]:
     data = data.copy()
     # filter only relevant activities count occurrences for each ObjectID
-    logger.info('Filtering activities per ObjectID')
+    logger.info('Filtering activities per ObjectID...')
     filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
     data_filter_activities = data.loc[filt_rel_activities].copy()
     num_activities_per_obj_id = cast(
@@ -113,7 +156,7 @@ def filter_activities_per_obj_id(
 
     num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
     data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
-    logger.info('Activities per ObjectID filtered successfully')
+    logger.info('Activities per ObjectID filtered successfully.')
 
     return data_filter_activities, num_activities_per_obj_id
 
@@ -129,7 +172,7 @@ def filter_activities_per_obj_id(
 ## use idx pairs to get idx values of series
 
 
-def get_timeline_candidates_index(
+def _get_timeline_candidates_index(
     data: DataFrame,
     num_activities_per_obj_id: Series,
     *,
@@ -161,7 +204,7 @@ def get_timeline_candidates_index(
 
 
 # TODO: check application for duplicate removal
-def transform_timeline_candidates(
+def _transform_timeline_candidates(
     candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
 ) -> TimelineCandidates:
     """function to build a mapping of ObjectIDs to their respective collection of
@@ -200,7 +243,7 @@ def transform_timeline_candidates(
     return candidates_by_obj_id
 
 
-def map_obj_id_to_texts(
+def _map_obj_id_to_texts(
     data: DataFrame,
     feature_obj_id: str = 'ObjektID',
 ) -> dict[ObjectID, str]:
@@ -229,7 +272,7 @@ def get_timeline_candidates(
     model_input_feature: str = 'nlp_model_input',
 ) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
     logger.info('Obtaining timeline candidates...')
-    candidates = get_timeline_candidates_index(
+    candidates = _get_timeline_candidates_index(
         data=data,
         num_activities_per_obj_id=num_activities_per_obj_id,
         model=model,
@@ -237,14 +280,52 @@ def get_timeline_candidates(
         feature_obj_id=feature_obj_id,
         model_input_feature=model_input_feature,
     )
-    tl_candidates = transform_timeline_candidates(candidates)
+    tl_candidates = _transform_timeline_candidates(candidates)
     logger.info('Timeline candidates obtained successfully.')
     # text mapping to obtain object descriptors
     logger.info('Mapping ObjectIDs to their respective text descriptor...')
-    map_obj_text = map_obj_id_to_texts(
+    map_obj_text = _map_obj_id_to_texts(
         data=data,
         feature_obj_id=feature_obj_id,
     )
     logger.info('ObjectIDs successfully mapped to text descriptors.')
 
     return tl_candidates, map_obj_text
+
+
+# ** Postprocessing
+# filter original dataset for a batch of timeline candidates
+def filter_timeline_cands(
+    data: DataFrame,
+    cands: TimelineCandidates,
+    obj_id: ObjectID,
+    entry_idx: int,
+    sort_feature: str = 'ErstellungsDatum',
+) -> DataFrameTLFiltered:
+    data = data.copy()
+    cands_for_obj_id = cands[obj_id]
+    cands_choice = cands_for_obj_id[entry_idx]
+    data = data.loc[list(cands_choice)].sort_values(
+        by=sort_feature,
+        ascending=True,
+    )
+
+    return data
+
+
+def calc_delta_to_next_failure(
+    data: DataFrameTLFiltered,
+    date_feature: str = 'ErstellungsDatum',
+    name_delta_feature: str = 'delta_to_next_failure',
+    convert_to_days: bool = True,
+) -> DataFrameTLFiltered:
+    data = data.copy()
+    last_val = data[date_feature].iat[-1]
+    shifted = data[date_feature].shift(-1, fill_value=last_val)
+    data[name_delta_feature] = shifted - data[date_feature]
+    data = data.sort_values(by=name_delta_feature, ascending=False)
+
+    if convert_to_days:
+        data[name_delta_feature] = data[name_delta_feature].dt.days
+
+    return data
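The core of calc_delta_to_next_failure is the shift(-1) trick: each row's gap is the next event date minus its own date, the final row gets a zero gap, and the result is sorted with the longest gap first. A minimal check with made-up dates:

```python
# Minimal check of the shift(-1) delta logic with made-up dates.
import pandas as pd

df = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-01-05', '2024-01-20'])}
)
last_val = df['ErstellungsDatum'].iat[-1]
shifted = df['ErstellungsDatum'].shift(-1, fill_value=last_val)
df['delta_to_next_failure'] = (shifted - df['ErstellungsDatum']).dt.days
print(df.sort_values(by='delta_to_next_failure', ascending=False))
#   ErstellungsDatum  delta_to_next_failure
# 1       2024-01-05                     15
# 0       2024-01-01                      4
# 2       2024-01-20                      0
```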
@@ -76,13 +76,14 @@ CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
     'isDeterministic': True,
     'singlePartition': False,
 }
+CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
 CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
 # name for property, on which selection is done
 CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
 CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
 CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
 
-# ** time analysis.uniqueness
+# ** time_analysis.uniqueness
 THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
     'threshold_unique_texts'
 ]
@@ -90,6 +91,10 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
     'criterion_feature'
 ]
 FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
+# ** time_analysis.preparation
+NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
+# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'Zeitspanne bis zur Behebung [Tage]'
+NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
 # ** time_analysis.model_input
 MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
     CONFIG['time_analysis']['model_input']['input_features']
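NAME_DELTA_FEAT_TO_NEXT_FAILURE is deliberately a human-readable German label rather than a technical key, presumably so that the dashboard tables and plot axes show it verbatim once it is used as a column name. A tiny illustration with toy values:

```python
# Toy illustration: the constant doubles as the visible column header.
from typing import Final

import pandas as pd

NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'

df = pd.DataFrame({NAME_DELTA_FEAT_TO_NEXT_FAILURE: [15, 4, 0]})
print(df.columns.tolist())  # the German label appears exactly as defined
```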
Binary file not shown.
@@ -3,15 +3,19 @@ from pathlib import Path
 from lang_main.analysis import graphs
 from lang_main.analysis.preprocessing import (
     analyse_feature,
-    clean_string_slim,
-    entry_wise_cleansing,
     load_raw_data,
     merge_similarity_dupl,
     numeric_pre_filter_feature,
     remove_duplicates,
     remove_NA,
 )
+from lang_main.analysis.shared import (
+    clean_string_slim,
+    entry_wise_cleansing,
+)
 from lang_main.analysis.timeline import (
+    calc_delta_to_repair,
+    cleanup_descriptions,
     filter_activities_per_obj_id,
     generate_model_input,
     get_timeline_candidates,
@@ -25,6 +29,7 @@ from lang_main.constants import (
     DATE_COLS,
     FEATURE_NAME_OBJ_ID,
     MODEL_INPUT_FEATURES,
+    NAME_DELTA_FEAT_TO_REPAIR,
     SAVE_PATH_FOLDER,
     SPCY_MODEL,
     STFR_MODEL,
@@ -56,7 +61,7 @@ def build_base_target_feature_pipe() -> Pipeline:
     pipe_target_feat.add(
         entry_wise_cleansing,
         {
-            'target_feature': 'VorgangsBeschreibung',
+            'target_feature': ('VorgangsBeschreibung',),
             'cleansing_func': clean_string_slim,
         },
         save_result=True,
@@ -182,7 +187,6 @@ def build_tk_graph_rescaling_pipe(
         graphs.pipe_add_graph_metrics,
         save_result=save_result,
         filename=exit_point,
-        # filename=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
     )
 
     return pipe_graph_rescaling
@@ -247,6 +251,23 @@ def build_tk_graph_render_pipe(
 # ** timeline analysis
 def build_timeline_pipe() -> Pipeline:
     pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_timeline.add(
+        cleanup_descriptions,
+        {
+            'properties': ['ErledigungsBeschreibung'],
+        },
+    )
+    pipe_timeline.add(
+        calc_delta_to_repair,
+        {
+            'date_feature_start': 'ErstellungsDatum',
+            'date_feature_end': 'ErledigungsDatum',
+            'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
+            'convert_to_days': True,
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE_POST,
+    )
     pipe_timeline.add(
         remove_non_relevant_obj_ids,
         {
@@ -281,7 +302,7 @@ def build_timeline_pipe() -> Pipeline:
             'model_input_feature': 'nlp_model_input',
         },
         save_result=True,
-        filename=EntryPoints.TIMELINE_POST,
+        filename=EntryPoints.TIMELINE_CANDS,
     )
 
     return pipe_timeline
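All steps registered with Pipeline.add follow the same convention: a callable plus a dict of fixed keyword arguments, with every step returning a tuple of results. The sketch below mimics that calling pattern with a hypothetical mini-runner; it is not the real lang_main.pipelines.base.Pipeline, only the shape the new cleanup_descriptions and calc_delta_to_repair steps rely on.

```python
# Hypothetical mini-runner illustrating the step convention only; the real
# Pipeline class is not shown in this diff.
from collections.abc import Callable
from typing import Any

Step = tuple[Callable[..., tuple[Any, ...]], dict[str, Any]]


def run_steps(initial: tuple[Any, ...], steps: list[Step]) -> tuple[Any, ...]:
    results = initial
    for func, params in steps:
        # every step consumes the previous tuple and hands back a new tuple
        results = func(*results, **params)
    return results


def add_suffix(text: str, suffix: str = '!') -> tuple[str]:
    return (text + suffix,)


def to_upper(text: str) -> tuple[str]:
    return (text.upper(),)


print(run_steps(('pumpe defekt',), [(add_suffix, {'suffix': '.'}), (to_upper, {})]))
# ('PUMPE DEFEKT.',)
```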
@@ -16,6 +16,7 @@ from lang_main.constants import (
     CYTO_LAYOUT_PROPERTIES,
     CYTO_NUMBER_SUBGRAPHS,
     CYTO_PATH_STYLESHEET,
+    CYTO_SANDBOX_NAME,
     CYTO_SELECTION_PROPERTY,
     CYTO_STYLESHEET_NAME,
     PROPERTY_NAME_DEGREE_WEIGHTED,
@@ -56,6 +57,8 @@ def verify_connection():
 def import_to_cytoscape(
     graph: DiGraph | Graph,
     network_name: str = CYTO_BASE_NETWORK_NAME,
+    sandbox_name: str = CYTO_SANDBOX_NAME,
+    reinitialise_sandbox: bool = True,
 ) -> None:
     """Cytoscape: import NetworkX graph as new network collection
 
@@ -66,6 +69,12 @@ def import_to_cytoscape(
     """
     logger.debug('Checking Cytoscape connection...')
     verify_connection()
+    logger.debug('Setting Cytoscape sandbox...')
+    p4c.sandbox_set(
+        sandbox_name=sandbox_name,
+        reinitialize=reinitialise_sandbox,
+        copy_samples=False,
+    )
     logger.debug('Importing to and analysing network in Cytoscape...')
     p4c.delete_all_networks()
     p4c.create_network_from_networkx(
@@ -122,6 +131,7 @@ def export_network_to_image(
     filetype: CytoExportFileTypes = 'SVG',
     network_name: str = CYTO_BASE_NETWORK_NAME,
     pdf_export_page_size: CytoExportPageSizes = 'A4',
+    sandbox_name: str = CYTO_SANDBOX_NAME,
 ) -> None:
     """Cytoscape: export current selected view as image
 
@@ -140,14 +150,17 @@ def export_network_to_image(
     logger.debug('Exporting image to file...')
     if not target_folder.exists():
         target_folder.mkdir(parents=True)
-    file_pth = target_folder / filename
+    dst_file_pth = (target_folder / filename).with_suffix(f'.{filetype.lower()}')
 
     text_as_font = True
     if filetype == 'SVG':
         text_as_font = False
 
+    # image is generated in sandbox directory and transferred to target destination
+    # (preparation for remote instances of Cytoscape)
+    # TODO close non-necessary windows before image display
     p4c.export_image(
-        filename=str(file_pth),
+        filename=filename,
         type=filetype,
         network=network_name,
         overwrite_file=True,
@@ -155,7 +168,24 @@ def export_network_to_image(
         export_text_as_font=text_as_font,
         page_size=pdf_export_page_size,
     )
-    logger.debug('Exporting image to file successful.')
+    # TODO change back to Cytoscape 3.10 and above
+    # TODO remove if Cytoscape >= 3.10.* is running in container
+    # p4c.export_image(
+    #     filename=filename,
+    #     type=filetype,
+    #     network=network_name,
+    #     overwrite_file=True,
+    # )
+    logger.debug('Exported image to sandbox.')
+    logger.debug('Transferring image from sandbox to target destination...')
+    sandbox_filename = f'{filename}.{filetype.lower()}'
+    p4c.sandbox_get_from(
+        source_file=sandbox_filename,
+        dest_file=str(dst_file_pth),
+        overwrite=True,
+        sandbox_name=sandbox_name,
+    )
+    logger.debug('Transfer of image from sandbox to target destination successful.')
 
 
 def layout_network(
@@ -192,6 +222,7 @@ def apply_style_to_network(
     node_size_property: str = 'node_selection',
     min_node_size: int = 15,
     max_node_size: int = 40,
+    sandbox_name: str = CYTO_SANDBOX_NAME,
 ) -> None:
     """Cytoscape: apply a chosen Cytoscape style to the defined network
 
@@ -221,7 +252,16 @@ def apply_style_to_network(
         raise FileNotFoundError(
             f'Visual stylesheet for Cytoscape not found under: >>{pth_to_stylesheet}<<'
         )
-    p4c.import_visual_styles(str(pth_to_stylesheet))
+    # send to sandbox
+    sandbox_filename = pth_to_stylesheet.name
+    p4c.sandbox_send_to(
+        source_file=pth_to_stylesheet,
+        dest_file=sandbox_filename,
+        overwrite=True,
+        sandbox_name=sandbox_name,
+    )
+    # load stylesheet
+    p4c.import_visual_styles(sandbox_filename)
 
     p4c.set_visual_style(style_name, network=network_name)
     # node size mapping, only if needed property is available
@@ -242,6 +282,7 @@ def apply_style_to_network(
         default_number=min_node_size,
     )
     p4c.set_node_size_mapping(**node_size_map)
+    # TODO removal
     # else:
     #     node_table = p4c.get_table_columns(table='node', network=network_name)
     #     nodes_SUID = node_table['SUID'].to_list()
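The sandbox changes above all serve one pattern: files no longer travel via absolute local paths but through a named py4cytoscape sandbox, which also works when Cytoscape runs remotely (for example in a container). A condensed sketch of that round trip, with placeholder file names:

```python
# Condensed sandbox round trip (py4cytoscape as p4c); file names are placeholders
# and error handling is omitted.
import py4cytoscape as p4c

SANDBOX = 'lang_main'

# 1) pin all file transfers to a named sandbox (reinitialise wipes old content)
p4c.sandbox_set(sandbox_name=SANDBOX, reinitialize=True, copy_samples=False)

# 2) push a local stylesheet into the sandbox, then import it from there
p4c.sandbox_send_to(
    source_file='styles/lang_main_styles.xml',
    dest_file='lang_main_styles.xml',
    overwrite=True,
    sandbox_name=SANDBOX,
)
p4c.import_visual_styles('lang_main_styles.xml')

# 3) export the image into the sandbox, then pull it to the local target path
p4c.export_image(filename='tk_graph', type='SVG', overwrite_file=True)
p4c.sandbox_get_from(
    source_file='tk_graph.svg',
    dest_file='out/tk_graph.svg',
    overwrite=True,
    sandbox_name=SANDBOX,
)
```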
@@ -1,5 +1,7 @@
 import enum
+from collections.abc import Hashable
 from typing import (
+    Any,
     Literal,
     Required,
     TypeAlias,
@@ -7,6 +9,7 @@ from typing import (
 )
 
 import numpy as np
+from pandas import DataFrame
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
 
@@ -33,6 +36,7 @@ ResultHandling: TypeAlias = list[tuple[bool, str | None]]
 class EntryPoints(enum.StrEnum):
     TIMELINE = 'TIMELINE'
     TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
+    TIMELINE_CANDS = 'TIMELINE_CANDIDATES'
     TIMELINE_TK_GRAPH_RESCALED = 'TIMELINE_TK_GRAPH_RESCALED'
     TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
     TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
@@ -44,6 +48,7 @@ class EntryPoints(enum.StrEnum):
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
 Embedding: TypeAlias = SpacyDoc | Tensor
+DataFrameTLFiltered: TypeAlias = DataFrame
 
 # ** graphs
 NodeTitle: TypeAlias = str
@@ -118,3 +123,8 @@ class CytoscapeData(TypedDict, total=False):
 
 # ** timeline
 TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
+
+
+# ** Dash (Dashboard)
+HTMLTable: TypeAlias = list[dict[Hashable, Any]]
+HTMLColumns: TypeAlias = list[dict[str, str]]
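The two new Dash aliases only give names to shapes that dash_table.DataTable already expects; a quick check against pandas output (toy frame) shows how they line up:

```python
# The aliases only name shapes pandas already produces for DataTable.
from collections.abc import Hashable
from typing import Any, TypeAlias

import pandas as pd

HTMLTable: TypeAlias = list[dict[Hashable, Any]]
HTMLColumns: TypeAlias = list[dict[str, str]]

df = pd.DataFrame({'ObjektID': [1, 2], 'VorgangsTypName': ['Reparatur', 'Wartung']})
columns: HTMLColumns = [{'name': col, 'id': col} for col in df.columns]
table: HTMLTable = df.to_dict('records')
print(columns)
print(table)
```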
File diff suppressed because one or more lines are too long