sandboxing
This commit is contained in:
parent 9197146d2c
commit 3f58a14852
@@ -1,9 +1,11 @@
import time
import webbrowser
from pathlib import Path
from collections.abc import Collection, Iterable
from threading import Thread
from typing import Any, Final, cast

import pandas as pd

# import dash_cytoscape as cyto
import plotly.express as px
from dash import (
@@ -21,20 +23,37 @@ from plotly.graph_objects import Figure

import lang_main.io
from lang_main.analysis import graphs, tokens
from lang_main.constants import SAVE_PATH_FOLDER, SPCY_MODEL
from lang_main.analysis.timeline import (
    calc_delta_to_next_failure,
    filter_timeline_cands,
)
from lang_main.constants import (
    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
    NAME_DELTA_FEAT_TO_REPAIR,
    SAVE_PATH_FOLDER,
    SPCY_MODEL,
)
from lang_main.errors import EmptyEdgesError, EmptyGraphError
from lang_main.pipelines.predefined import (
    build_tk_graph_render_pipe,
    build_tk_graph_rescaling_pipe,
)
from lang_main.types import EntryPoints, ObjectID, TimelineCandidates
from lang_main.types import (
    DataFrameTLFiltered,
    EntryPoints,
    HTMLColumns,
    HTMLTable,
    ObjectID,
    TimelineCandidates,
)

# ** data
# p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
(data,) = cast(tuple[DataFrame], lang_main.io.load_pickle(p_df))
# data = cleanup_descriptions(data, properties=['ErledigungsBeschreibung'])
# p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_CANDS)
cands, texts = cast(
    tuple[TimelineCandidates, dict[ObjectID, str]], lang_main.io.load_pickle(p_tl)
)
@@ -56,17 +75,27 @@ PTH_RENDERED_GRAPH = lang_main.io.get_entry_point(
    file_ext='.svg',
)


TABLE_FEATS: Final[list[str]] = [
# NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
TABLE_FEATS_OVERVIEW: Final[list[str]] = [
    'ErstellungsDatum',
    'ErledigungsDatum',
    NAME_DELTA_FEAT_TO_REPAIR,
    'VorgangsTypName',
    'VorgangsBeschreibung',
    'ErledigungsBeschreibung',
]
TABLE_FEATS_DATES: Final[list[str]] = [
    'ErstellungsDatum',
    'ErledigungsDatum',
]
TABLE_FEATS_BEST_ACTIONS: Final[list[str]] = [
    'ErstellungsDatum',
    'ErledigungsDatum',
    'VorgangsTypName',
    'VorgangsBeschreibung',
    'ErledigungsBeschreibung',
    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
]

# ** figure config
MARKERS_OCCURRENCES: Final[dict[str, Any]] = {
@@ -86,13 +115,15 @@ HOVER_DATA: Final[dict[str, Any]] = {
    'ErstellungsDatum': '|%d.%m.%Y',
    'ErledigungsDatum': '|%d.%m.%Y',
    'VorgangsBeschreibung': True,
    'ErledigungsBeschreibung': True,
}
HOVER_DATA_DELTA: Final[dict[str, Any]] = {
    'ErstellungsDatum': '|%d.%m.%Y',
    'ErledigungsDatum': '|%d.%m.%Y',
    'VorgangsDatum': '|%d.%m.%Y',
    'delta': True,
    NAME_DELTA_FEAT_TO_REPAIR: True,
    'VorgangsBeschreibung': True,
    'ErledigungsBeschreibung': True,
}

# ** graph
@@ -136,10 +167,10 @@ graph_layout = html.Div(
        html.Img(
            id='static-graph-img',
            alt='static rendered graph',
            # style={
            #     'width': 'auto',
            #     'height': 'auto',
            # },
            style={
                'width': 'auto',
                'height': 'auto',
            },
        ),
        html.P(id='info-graph-errors', children=[]),
    ],
@@ -186,7 +217,27 @@ app.layout = html.Div(
            ]
        ),
        html.Div(
            [dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
            children=[
                html.Div(
                    [
                        html.H5('Überblick ähnlicher Vorgänge'),
                        dash_table.DataTable(id='table-candidates'),
                    ],
                    style={'paddingBottom': '1em'},
                ),
                html.Div(
                    [
                        html.H5(
                            (
                                'Maßnahmen sortiert nach längstem Zeitraum '
                                'bis zum nächsten Ereignis'
                            )
                        ),
                        dash_table.DataTable(id='table-best-actions'),
                    ]
                ),
            ],
            style={'marginBottom': '2em', 'padding': '2em'},
        ),
        graph_layout,
    ],
@@ -222,20 +273,21 @@ def update_choice_candidates(obj_id):


# ** helpers to filter DataFrame
def pre_filter_data(
def filter_candidates(
    data: DataFrame,
    idx: int,
    obj_id: ObjectID,
) -> DataFrame:
) -> DataFrameTLFiltered:
    # assert correct data type because of Dash
    idx = int(idx)
    obj_id = int(obj_id)
    # data = data.copy()
    cands_for_obj_id = cands[obj_id]
    cands_choice = cands_for_obj_id[int(idx) - 1]
    # data
    data = data.loc[list(cands_choice)].sort_index()  # type: ignore
    data['delta'] = data['ErledigungsDatum'] - data['ErstellungsDatum']
    data['delta'] = data['delta'].dt.days

    data = filter_timeline_cands(
        data=data,
        cands=cands,
        obj_id=obj_id,
        entry_idx=(idx - 1),  # idx in Dashboard starts with 1
    )

    return data

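A small sketch (toy candidate mapping, not from the data) of the index convention handled above: the dashboard selector is 1-based, while the candidate tuples and filter_timeline_cands are 0-based.

toy_cands = {42: ((3, 7, 9), (12, 15))}  # two candidate groups for ObjectID 42
selector_value = 2                       # as delivered by the Dash dropdown
group = toy_cands[42][selector_value - 1]
print(group)  # -> (12, 15); the same shift happens via entry_idx=(idx - 1) above
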
@@ -258,10 +310,10 @@ def update_timeline(index, obj_id):
    obj_text = texts[obj_id]
    title_occurrences = f'HObjektText: {obj_text}'
    title_delta = f'HObjektText: {obj_text}, Differenz Erstellung und Erledigung'
    df = pre_filter_data(data, idx=index, obj_id=obj_id)
    df = filter_candidates(data, idx=index, obj_id=obj_id)
    # figure
    fig_occurrences = fig_timeline_occurrences(df, title_occurrences)
    fig_delta = fig_timeline_delta(df, title_delta)
    fig_delta = fig_timeline_delta(df, title_delta, delta_feature=NAME_DELTA_FEAT_TO_REPAIR)

    return fig_occurrences, fig_delta

@@ -293,11 +345,12 @@ def fig_timeline_occurrences(
def fig_timeline_delta(
    df: DataFrame,
    title: str,
    delta_feature: str,
) -> Figure:
    fig = px.scatter(
        data_frame=df,
        x='ErstellungsDatum',
        y='delta',
        y=delta_feature,
        title=title,
        hover_data=HOVER_DATA_DELTA,
    )
@@ -309,25 +362,77 @@ def fig_timeline_delta(
    return fig


def transform_to_HTML_table(
    data: DataFrame,
    target_features: Collection[str],
    date_cols: Iterable[str] | None = None,
    sorting_feature: str | None = None,
    sorting_ascending: bool = True,
) -> tuple[HTMLColumns, HTMLTable]:
    target_features = list(target_features)
    data = data.copy()
    data = data.filter(items=target_features, axis=1)

    if sorting_feature is not None:
        data = data.sort_values(by=sorting_feature, ascending=sorting_ascending)

    if date_cols is not None:
        for col in date_cols:
            data[col] = data[col].dt.strftime(r'%Y-%m-%d')

    columns = [{'name': col, 'id': col} for col in data.columns]
    table_data = data.to_dict('records')

    return columns, table_data


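A minimal, self-contained usage sketch of the helper above feeding a Dash DataTable; the toy DataFrame and its values are illustrative only.

import pandas as pd
from dash import dash_table

toy = pd.DataFrame(
    {
        'ErstellungsDatum': pd.to_datetime(['2024-02-01', '2024-01-15']),
        'ErledigungsDatum': pd.to_datetime(['2024-02-03', '2024-01-20']),
        'VorgangsBeschreibung': ['pump inspected', 'filter replaced'],
    }
)
cols, rows = transform_to_HTML_table(
    toy,
    target_features=['ErstellungsDatum', 'ErledigungsDatum', 'VorgangsBeschreibung'],
    date_cols=['ErstellungsDatum', 'ErledigungsDatum'],
    sorting_feature='ErstellungsDatum',
)
# `cols` -> [{'name': ..., 'id': ...}, ...] and `rows` -> list of records,
# exactly the shapes expected by dash_table.DataTable(columns=cols, data=rows)
table = dash_table.DataTable(columns=cols, data=rows)
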
# 'table-best-actions'
# ** HTML table
@callback(
    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
    [
        Output('table-candidates', 'columns'),
        Output('table-candidates', 'data'),
        Output('table-best-actions', 'columns'),
        Output('table-best-actions', 'data'),
    ],
    Input('selector-candidates', 'value'),
    State('selector-obj_id', 'value'),
    prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
    df = pre_filter_data(data, idx=index, obj_id=obj_id)
    df = df.filter(items=TABLE_FEATS, axis=1).sort_values(
        by='ErstellungsDatum', ascending=True
def update_tables_candidates(
    index,
    obj_id,
) -> tuple[HTMLColumns, HTMLTable, HTMLColumns, HTMLTable]:
    cands = filter_candidates(data, idx=index, obj_id=obj_id)
    overview_cols, overview_table = transform_to_HTML_table(
        data=cands,
        target_features=TABLE_FEATS_OVERVIEW,
        date_cols=TABLE_FEATS_DATES,
        sorting_feature='ErstellungsDatum',
        sorting_ascending=True,
    )
    cols = [{'name': i, 'id': i} for i in df.columns]
    # convert dates to strings
    for col in TABLE_FEATS_DATES:
        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
    # df = df.filter(items=TABLE_FEATS_OVERVIEW, axis=1).sort_values(
    #     by='ErstellungsDatum', ascending=True
    # )
    # cols = [{'name': i, 'id': i} for i in df.columns]
    # # convert dates to strings
    # for col in TABLE_FEATS_DATES:
    #     df[col] = df[col].dt.strftime(r'%Y-%m-%d')

    table_data = df.to_dict('records')
    return table_data, cols
    # table_data = df.to_dict('records')

    cands_best_actions = calc_delta_to_next_failure(
        data=cands,
        date_feature='ErstellungsDatum',
        name_delta_feature=NAME_DELTA_FEAT_TO_NEXT_FAILURE,
    )
    best_actions_cols, best_actions_table = transform_to_HTML_table(
        data=cands_best_actions,
        target_features=TABLE_FEATS_BEST_ACTIONS,
        date_cols=TABLE_FEATS_DATES,
    )

    return overview_cols, overview_table, best_actions_cols, best_actions_table


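A minimal, self-contained sketch (hypothetical component ids) of the positional mapping Dash uses for multi-output callbacks, which the reworked callback above relies on when returning columns and data for both tables.

from dash import Dash, Input, Output, dash_table, dcc, html

demo = Dash(__name__)
demo.layout = html.Div([dcc.Input(id='demo-in'), dash_table.DataTable(id='demo-table')])


@demo.callback(
    [Output('demo-table', 'columns'), Output('demo-table', 'data')],
    Input('demo-in', 'value'),
    prevent_initial_call=True,
)
def fill_demo_table(_value):
    cols = [{'name': 'a', 'id': 'a'}]
    rows = [{'a': 1}, {'a': 2}]
    # the returned tuple maps onto the Output list by position: columns first, then data
    return cols, rows
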
# ** graph callbacks
@@ -345,7 +450,7 @@ def update_table_candidates(index, obj_id):
def display_candidates_as_graph(index, obj_id):
    error_msg = ''
    t1 = time.perf_counter()
    df = pre_filter_data(data, idx=index, obj_id=obj_id)
    df = filter_candidates(data, idx=index, obj_id=obj_id)
    t2 = time.perf_counter()
    print(f'Time for filtering: {t2 - t1} s')


@@ -1,9 +1,8 @@
import re
from collections.abc import Iterable
from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import Callable, cast
from typing import cast

import numpy as np
import pandas as pd
@@ -25,11 +24,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex

# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')


# ** (1) dataset preparation: loading and simple preprocessing
@@ -37,7 +37,7 @@ pattern_whitespace = re.compile(r'[ ]{2,}')
# duplicate cleansing based on all properties
def load_raw_data(
    path: Path,
    date_cols: Iterable[str] = (
    date_cols: Collection[str] = (
        'VorgangsDatum',
        'ErledigungsDatum',
        'Arbeitsbeginn',
@@ -50,7 +50,7 @@ def load_raw_data(
    ----------
    path : str
        path to dataset file, usually CSV file
    date_cols : list[str], optional
    date_cols : Collection[str], optional
        columns which contain dates and are parsed as such,
        by default (
        'VorgangsDatum',
@@ -129,9 +129,7 @@ def remove_duplicates(

def remove_NA(
    data: DataFrame,
    target_features: list[str] = [
        'VorgangsBeschreibung',
    ],
    target_features: Collection[str] = ('VorgangsBeschreibung',),
) -> tuple[DataFrame]:
    """function to drop NA entries based on a subset of features to be analysed

@@ -139,14 +137,15 @@ def remove_NA(
    ----------
    data : DataFrame
        standard IHM dataset, perhaps pre-cleaned
    target_features : list[str], optional
        subset to analyse to define an NA entry, by default [ 'VorgangsBeschreibung', ]
    target_features : Collection[str], optional
        subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)

    Returns
    -------
    DataFrame
        dataset with removed NA entries for given subset of features
    """
    target_features = list(target_features)
    wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy()  # type: ignore
    logger.info(
        f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
@@ -156,46 +155,7 @@ def remove_NA(


# ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset
def clean_string_slim(string: str) -> str:
    """mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    """
    # remove special chars
    string = pattern_special_chars.sub(' ', string)
    string = pattern_repeated_chars.sub(r'\1', string)
    # string = pattern_dates.sub('', string)
    # dates are used for context, should not be removed at this stage
    string = pattern_whitespace.sub(' ', string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string


def entry_wise_cleansing(
    data: DataFrame,
    target_feature: str,
    cleansing_func: Callable[[str], str],
) -> tuple[DataFrame]:
    # apply given cleansing function to target feature
    data[target_feature] = data[target_feature].map(cleansing_func)
    logger.info(
        ('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
        cleansing_func.__name__,
        target_feature,
    )
    return (data,)
# ** moved to module ``lang_main.analysis.shared``


# ** in-depth analysis of one feature

@@ -1,4 +1,5 @@
from collections.abc import Iterable, Iterator
import re
from collections.abc import Callable, Collection, Iterable, Iterator
from typing import cast

import networkx as nx
@@ -7,14 +8,70 @@ import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from torch import Tensor

from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.loggers import logger_preprocess as logger
from lang_main.types import PandasIndex

# ** RE patterns
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')


# ** RE applications
# following functions clean and prepare specific entries, not whole datasets
def clean_string_slim(string: str) -> str:
    """mapping function to clean single string entries in a series (feature-wise)
    of the dataset, used to be applied element-wise for string features

    Parameters
    ----------
    string : str
        dataset entry feature

    Returns
    -------
    str
        cleaned entry
    """
    # remove special chars
    string = pattern_escape_newline.sub('. ', string)
    string = pattern_escape_seq.sub(' ', string)
    string = pattern_repeated_chars.sub('', string)
    # string = pattern_dates.sub('', string)
    # dates are used for context, should not be removed at this stage
    string = pattern_whitespace.sub(' ', string)
    # remove whitespaces at the beginning and the end
    string = string.strip()

    return string


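A small usage sketch (illustrative input only) of the rewritten clean_string_slim: newlines become sentence breaks, other escape sequences and repeated punctuation are collapsed, and surrounding whitespace is stripped.

from lang_main.analysis.shared import clean_string_slim

raw = 'Pumpe defekt!!!\nAustausch erfolgt..\t'
print(clean_string_slim(raw))
# expected along the lines of: 'Pumpe defekt. Austausch erfolgt.'
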
# ** dataset cleansing
def entry_wise_cleansing(
    data: DataFrame,
    target_features: Collection[str],
    cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
    # apply given cleansing function to target feature
    target_features = list(target_features)
    data[target_features] = data[target_features].map(cleansing_func)
    logger.info(
        ('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
        cleansing_func.__name__,
        target_features,
    )
    return (data,)


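A hedged usage sketch of the generalised entry_wise_cleansing (toy DataFrame; column names taken from the dataset used elsewhere in this commit): it now cleans several features in one pass and returns a one-element tuple, matching the pipeline step convention.

import pandas as pd

from lang_main.analysis.shared import entry_wise_cleansing

frame = pd.DataFrame(
    {
        'VorgangsBeschreibung': ['Motor läuft nicht an!!!\nPrüfung nötig'],
        'ErledigungsBeschreibung': ['Sicherung getauscht..'],
    }
)
# default cleansing_func is clean_string_slim; both columns are cleaned element-wise
(cleaned,) = entry_wise_cleansing(
    frame,
    target_features=('VorgangsBeschreibung', 'ErledigungsBeschreibung'),
)
print(cleaned.iloc[0].to_dict())
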
# ** similarities
def candidates_by_index(
    data_model_input: Series,
    model: SentenceTransformer,

@@ -1,4 +1,4 @@
from collections.abc import Iterable, Iterator
from collections.abc import Collection, Iterable, Iterator
from typing import cast

from pandas import DataFrame, Series
@@ -7,14 +7,21 @@ from tqdm.auto import tqdm  # TODO: check deletion

from lang_main.analysis.shared import (
    candidates_by_index,
    entry_wise_cleansing,
    pattern_escape_seq_sentences,
    similar_index_connection_graph,
    similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
from lang_main.types import (
    DataFrameTLFiltered,
    ObjectID,
    PandasIndex,
    TimelineCandidates,
)


def non_relevant_obj_ids(
def _non_relevant_obj_ids(
    data: DataFrame,
    thresh_unique_feat_per_id: int,
    *,
@@ -50,9 +57,9 @@ def remove_non_relevant_obj_ids(
    feature_uniqueness: str = 'HObjektText',
    feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
    logger.info('Removing non-relevant ObjectIDs from dataset')
    logger.info('Removing non-relevant ObjectIDs from dataset...')
    data = data.copy()
    ids_to_ignore = non_relevant_obj_ids(
    ids_to_ignore = _non_relevant_obj_ids(
        data=data,
        thresh_unique_feat_per_id=thresh_unique_feat_per_id,
        feature_uniqueness=feature_uniqueness,
@@ -61,7 +68,43 @@ def remove_non_relevant_obj_ids(
    # only retain entries with ObjectIDs not in IDs to ignore
    data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
    logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
    logger.info('Non-relevant ObjectIDs removed successfully')
    logger.info('Non-relevant ObjectIDs removed successfully.')

    return (data,)


def cleanup_descriptions(
    data: DataFrame,
    properties: Collection[str] = (
        'VorgangsBeschreibung',
        'ErledigungsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Cleaning necessary descriptions...')
    data = data.copy()
    features = list(properties)
    data[features] = data[features].fillna('N.V.')
    (data,) = entry_wise_cleansing(data, target_features=features)
    logger.info('Cleansing successful.')

    return (data.copy(),)


def calc_delta_to_repair(
    data: DataFrame,
    date_feature_start: str = 'ErstellungsDatum',
    date_feature_end: str = 'ErledigungsDatum',
    name_delta_feature: str = 'delta_to_repair',
    convert_to_days: bool = True,
) -> tuple[DataFrame]:
    logger.info('Calculating time differences between start and end of operations...')
    data = data.copy()
    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]

    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    logger.info('Calculation successful.')

    return (data,)

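A small worked sketch (toy dates, not from the dataset) of calc_delta_to_repair: the delta column is the end date minus the start date, converted to whole days by default.

import pandas as pd

from lang_main.analysis.timeline import calc_delta_to_repair

toy = pd.DataFrame(
    {
        'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-03-10']),
        'ErledigungsDatum': pd.to_datetime(['2024-01-04', '2024-03-10']),
    }
)
(with_delta,) = calc_delta_to_repair(toy, name_delta_feature='delta_to_repair')
print(with_delta['delta_to_repair'].tolist())  # -> [3, 0]
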
@@ -75,7 +118,7 @@ def generate_model_input(
        'VorgangsBeschreibung',
    ),
) -> tuple[DataFrame]:
    logger.info('Generating concatenation of model input features')
    logger.info('Generating concatenation of model input features...')
    data = data.copy()
    model_input_features = list(model_input_features)
    input_features = data[model_input_features].fillna('').astype(str)
@@ -83,7 +126,7 @@
        lambda x: ' - '.join(x),
        axis=1,
    )
    logger.info('Model input generated successfully')
    logger.info('Model input generated successfully.')

    return (data,)

@@ -97,7 +140,7 @@ def filter_activities_per_obj_id(
) -> tuple[DataFrame, Series]:
    data = data.copy()
    # filter only relevant activities count occurrences for each ObjectID
    logger.info('Filtering activities per ObjectID')
    logger.info('Filtering activities per ObjectID...')
    filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
    data_filter_activities = data.loc[filt_rel_activities].copy()
    num_activities_per_obj_id = cast(
@@ -113,7 +156,7 @@ def filter_activities_per_obj_id(

    num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
    data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
    logger.info('Activities per ObjectID filtered successfully')
    logger.info('Activities per ObjectID filtered successfully.')

    return data_filter_activities, num_activities_per_obj_id

@@ -129,7 +172,7 @@ def filter_activities_per_obj_id(
## use idx pairs to get idx values of series


def get_timeline_candidates_index(
def _get_timeline_candidates_index(
    data: DataFrame,
    num_activities_per_obj_id: Series,
    *,
@@ -161,7 +204,7 @@ def get_timeline_candidates_index(


# TODO: check application for duplicate removal
def transform_timeline_candidates(
def _transform_timeline_candidates(
    candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
    """function to build a mapping of ObjectIDs to their respective collection of
@@ -200,7 +243,7 @@ def transform_timeline_candidates(
    return candidates_by_obj_id


def map_obj_id_to_texts(
def _map_obj_id_to_texts(
    data: DataFrame,
    feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
@@ -229,7 +272,7 @@ def get_timeline_candidates(
    model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
    logger.info('Obtaining timeline candidates...')
    candidates = get_timeline_candidates_index(
    candidates = _get_timeline_candidates_index(
        data=data,
        num_activities_per_obj_id=num_activities_per_obj_id,
        model=model,
@@ -237,14 +280,52 @@ def get_timeline_candidates(
        feature_obj_id=feature_obj_id,
        model_input_feature=model_input_feature,
    )
    tl_candidates = transform_timeline_candidates(candidates)
    tl_candidates = _transform_timeline_candidates(candidates)
    logger.info('Timeline candidates obtained successfully.')
    # text mapping to obtain object descriptors
    logger.info('Mapping ObjectIDs to their respective text descriptor...')
    map_obj_text = map_obj_id_to_texts(
    map_obj_text = _map_obj_id_to_texts(
        data=data,
        feature_obj_id=feature_obj_id,
    )
    logger.info('ObjectIDs successfully mapped to text descriptors.')

    return tl_candidates, map_obj_text


# ** Postprocessing
# filter original dataset for a batch of timeline candidates
def filter_timeline_cands(
    data: DataFrame,
    cands: TimelineCandidates,
    obj_id: ObjectID,
    entry_idx: int,
    sort_feature: str = 'ErstellungsDatum',
) -> DataFrameTLFiltered:
    data = data.copy()
    cands_for_obj_id = cands[obj_id]
    cands_choice = cands_for_obj_id[entry_idx]
    data = data.loc[list(cands_choice)].sort_values(
        by=sort_feature,
        ascending=True,
    )

    return data


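A hedged sketch (toy indices only) of the data shapes involved: TimelineCandidates maps each ObjectID to several groups of row indices, and filter_timeline_cands picks one group and returns the matching rows sorted by the date feature.

import pandas as pd

from lang_main.analysis.timeline import filter_timeline_cands

toy = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-03-01', '2024-01-01', '2024-02-01'])},
    index=[10, 11, 12],
)
toy_cands = {7: ((11, 10), (12,))}  # ObjectID 7 has two candidate groups
subset = filter_timeline_cands(toy, cands=toy_cands, obj_id=7, entry_idx=0)
print(subset.index.tolist())  # rows ordered by 'ErstellungsDatum' -> [11, 10]
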
def calc_delta_to_next_failure(
    data: DataFrameTLFiltered,
    date_feature: str = 'ErstellungsDatum',
    name_delta_feature: str = 'delta_to_next_failure',
    convert_to_days: bool = True,
) -> DataFrameTLFiltered:
    data = data.copy()
    last_val = data[date_feature].iat[-1]
    shifted = data[date_feature].shift(-1, fill_value=last_val)
    data[name_delta_feature] = shifted - data[date_feature]
    data = data.sort_values(by=name_delta_feature, ascending=False)

    if convert_to_days:
        data[name_delta_feature] = data[name_delta_feature].dt.days

    return data
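A worked toy example of the shift(-1) logic above: each event's delta is the time until the next event, the last event gets 0, and rows are ordered by the longest gap first.

import pandas as pd

from lang_main.analysis.timeline import calc_delta_to_next_failure

events = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-01-03', '2024-01-10'])}
)
ranked = calc_delta_to_next_failure(events)
print(ranked['delta_to_next_failure'].tolist())  # -> [7, 2, 0]
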
@@ -76,13 +76,14 @@ CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
    'isDeterministic': True,
    'singlePartition': False,
}
CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
# name for property, on which selection is done
CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2

# ** time analysis.uniqueness
# ** time_analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
    'threshold_unique_texts'
]
@@ -90,6 +91,10 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
    'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.preparation
NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'Zeitspanne bis zur Behebung [Tage]'
NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
    CONFIG['time_analysis']['model_input']['input_features']
Binary file not shown.
@@ -3,15 +3,19 @@ from pathlib import Path
from lang_main.analysis import graphs
from lang_main.analysis.preprocessing import (
    analyse_feature,
    clean_string_slim,
    entry_wise_cleansing,
    load_raw_data,
    merge_similarity_dupl,
    numeric_pre_filter_feature,
    remove_duplicates,
    remove_NA,
)
from lang_main.analysis.shared import (
    clean_string_slim,
    entry_wise_cleansing,
)
from lang_main.analysis.timeline import (
    calc_delta_to_repair,
    cleanup_descriptions,
    filter_activities_per_obj_id,
    generate_model_input,
    get_timeline_candidates,
@@ -25,6 +29,7 @@ from lang_main.constants import (
    DATE_COLS,
    FEATURE_NAME_OBJ_ID,
    MODEL_INPUT_FEATURES,
    NAME_DELTA_FEAT_TO_REPAIR,
    SAVE_PATH_FOLDER,
    SPCY_MODEL,
    STFR_MODEL,
@@ -56,7 +61,7 @@ def build_base_target_feature_pipe() -> Pipeline:
    pipe_target_feat.add(
        entry_wise_cleansing,
        {
            'target_feature': 'VorgangsBeschreibung',
            'target_features': ('VorgangsBeschreibung',),
            'cleansing_func': clean_string_slim,
        },
        save_result=True,
@@ -182,7 +187,6 @@ def build_tk_graph_rescaling_pipe(
        graphs.pipe_add_graph_metrics,
        save_result=save_result,
        filename=exit_point,
        # filename=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
    )

    return pipe_graph_rescaling
@@ -247,6 +251,23 @@ def build_tk_graph_render_pipe(
# ** timeline analysis
def build_timeline_pipe() -> Pipeline:
    pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
    pipe_timeline.add(
        cleanup_descriptions,
        {
            'properties': ['ErledigungsBeschreibung'],
        },
    )
    pipe_timeline.add(
        calc_delta_to_repair,
        {
            'date_feature_start': 'ErstellungsDatum',
            'date_feature_end': 'ErledigungsDatum',
            'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
            'convert_to_days': True,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
    )
    pipe_timeline.add(
        remove_non_relevant_obj_ids,
        {
@@ -281,7 +302,7 @@ def build_timeline_pipe() -> Pipeline:
            'model_input_feature': 'nlp_model_input',
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
        filename=EntryPoints.TIMELINE_CANDS,
    )

    return pipe_timeline

@@ -16,6 +16,7 @@ from lang_main.constants import (
    CYTO_LAYOUT_PROPERTIES,
    CYTO_NUMBER_SUBGRAPHS,
    CYTO_PATH_STYLESHEET,
    CYTO_SANDBOX_NAME,
    CYTO_SELECTION_PROPERTY,
    CYTO_STYLESHEET_NAME,
    PROPERTY_NAME_DEGREE_WEIGHTED,
@@ -56,6 +57,8 @@ def verify_connection():
def import_to_cytoscape(
    graph: DiGraph | Graph,
    network_name: str = CYTO_BASE_NETWORK_NAME,
    sandbox_name: str = CYTO_SANDBOX_NAME,
    reinitialise_sandbox: bool = True,
) -> None:
    """Cytoscape: import NetworkX graph as new network collection

@@ -66,6 +69,12 @@
    """
    logger.debug('Checking Cytoscape connection...')
    verify_connection()
    logger.debug('Setting Cytoscape sandbox...')
    p4c.sandbox_set(
        sandbox_name=sandbox_name,
        reinitialize=reinitialise_sandbox,
        copy_samples=False,
    )
    logger.debug('Importing to and analysing network in Cytoscape...')
    p4c.delete_all_networks()
    p4c.create_network_from_networkx(
@@ -122,6 +131,7 @@ def export_network_to_image(
    filetype: CytoExportFileTypes = 'SVG',
    network_name: str = CYTO_BASE_NETWORK_NAME,
    pdf_export_page_size: CytoExportPageSizes = 'A4',
    sandbox_name: str = CYTO_SANDBOX_NAME,
) -> None:
    """Cytoscape: export current selected view as image

@@ -140,14 +150,17 @@
    logger.debug('Exporting image to file...')
    if not target_folder.exists():
        target_folder.mkdir(parents=True)
    file_pth = target_folder / filename
    dst_file_pth = (target_folder / filename).with_suffix(f'.{filetype.lower()}')

    text_as_font = True
    if filetype == 'SVG':
        text_as_font = False

    # image is generated in sandbox directory and transferred to target destination
    # (preparation for remote instances of Cytoscape)
    # TODO close non-necessary windows before image display
    p4c.export_image(
        filename=str(file_pth),
        filename=filename,
        type=filetype,
        network=network_name,
        overwrite_file=True,
@@ -155,7 +168,24 @@
        export_text_as_font=text_as_font,
        page_size=pdf_export_page_size,
    )
    logger.debug('Exporting image to file successful.')
    # TODO change back to Cytoscape 3.10 and above
    # TODO remove if Cytoscape >= 3.10.* is running in container
    # p4c.export_image(
    #     filename=filename,
    #     type=filetype,
    #     network=network_name,
    #     overwrite_file=True,
    # )
    logger.debug('Exported image to sandbox.')
    logger.debug('Transferring image from sandbox to target destination...')
    sandbox_filename = f'{filename}.{filetype.lower()}'
    p4c.sandbox_get_from(
        source_file=sandbox_filename,
        dest_file=str(dst_file_pth),
        overwrite=True,
        sandbox_name=sandbox_name,
    )
    logger.debug('Transfer of image from sandbox to target destination successful.')


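For orientation, a condensed and hedged sketch of the py4cytoscape sandbox round-trip this commit introduces (the sandbox and file names are illustrative): files written by Cytoscape land in the sandbox and are transferred explicitly, which also works against a remote Cytoscape instance.

import py4cytoscape as p4c

SANDBOX = 'lang_main'  # same role as CYTO_SANDBOX_NAME
p4c.sandbox_set(sandbox_name=SANDBOX, reinitialize=True, copy_samples=False)

# push a local file (e.g. a visual stylesheet) into the sandbox ...
p4c.sandbox_send_to(source_file='styles/lang_main.xml', dest_file='lang_main.xml',
                    overwrite=True, sandbox_name=SANDBOX)

# ... let Cytoscape write an export into the sandbox ...
p4c.export_image(filename='network', type='SVG', overwrite_file=True)

# ... and pull the result back to the local target folder
p4c.sandbox_get_from(source_file='network.svg', dest_file='results/network.svg',
                     overwrite=True, sandbox_name=SANDBOX)
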
def layout_network(
@@ -192,6 +222,7 @@ def apply_style_to_network(
    node_size_property: str = 'node_selection',
    min_node_size: int = 15,
    max_node_size: int = 40,
    sandbox_name: str = CYTO_SANDBOX_NAME,
) -> None:
    """Cytoscape: apply a chosen Cytoscape style to the defined network

@@ -221,7 +252,16 @@
        raise FileNotFoundError(
            f'Visual stylesheet for Cytoscape not found under: >>{pth_to_stylesheet}<<'
        )
    p4c.import_visual_styles(str(pth_to_stylesheet))
    # send to sandbox
    sandbox_filename = pth_to_stylesheet.name
    p4c.sandbox_send_to(
        source_file=pth_to_stylesheet,
        dest_file=sandbox_filename,
        overwrite=True,
        sandbox_name=sandbox_name,
    )
    # load stylesheet
    p4c.import_visual_styles(sandbox_filename)

    p4c.set_visual_style(style_name, network=network_name)
    # node size mapping, only if needed property is available
@@ -242,6 +282,7 @@ def apply_style_to_network(
            default_number=min_node_size,
        )
        p4c.set_node_size_mapping(**node_size_map)
    # TODO removal
    # else:
    #     node_table = p4c.get_table_columns(table='node', network=network_name)
    #     nodes_SUID = node_table['SUID'].to_list()

@@ -1,5 +1,7 @@
import enum
from collections.abc import Hashable
from typing import (
    Any,
    Literal,
    Required,
    TypeAlias,
@@ -7,6 +9,7 @@ from typing import (
)

import numpy as np
from pandas import DataFrame
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor

@@ -33,6 +36,7 @@ ResultHandling: TypeAlias = list[tuple[bool, str | None]]
class EntryPoints(enum.StrEnum):
    TIMELINE = 'TIMELINE'
    TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
    TIMELINE_CANDS = 'TIMELINE_CANDIDATES'
    TIMELINE_TK_GRAPH_RESCALED = 'TIMELINE_TK_GRAPH_RESCALED'
    TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
    TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
@@ -44,6 +48,7 @@ class EntryPoints(enum.StrEnum):
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor
DataFrameTLFiltered: TypeAlias = DataFrame

# ** graphs
NodeTitle: TypeAlias = str
@@ -118,3 +123,8 @@ class CytoscapeData(TypedDict, total=False):

# ** timeline
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]


# ** Dash (Dashboard)
HTMLTable: TypeAlias = list[dict[Hashable, Any]]
HTMLColumns: TypeAlias = list[dict[str, str]]

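A hedged illustration (toy values) of the new Dash-related aliases next to the existing TimelineCandidates shape, matching how the dashboard callbacks above fill the DataTables.

from lang_main.types import HTMLColumns, HTMLTable, TimelineCandidates

cands: TimelineCandidates = {42: ((3, 7, 9), (12, 15))}
columns: HTMLColumns = [{'name': 'ErstellungsDatum', 'id': 'ErstellungsDatum'}]
rows: HTMLTable = [{'ErstellungsDatum': '2024-01-04'}]
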
File diff suppressed because one or more lines are too long