sandboxing

Florian Förster 2024-08-05 08:43:45 +02:00
parent 9197146d2c
commit 3f58a14852
10 changed files with 2362 additions and 283 deletions

View File

@@ -1,9 +1,11 @@
 import time
 import webbrowser
-from pathlib import Path
+from collections.abc import Collection, Iterable
 from threading import Thread
 from typing import Any, Final, cast
+import pandas as pd
 # import dash_cytoscape as cyto
 import plotly.express as px
 from dash import (
@@ -21,20 +23,37 @@ from plotly.graph_objects import Figure
 import lang_main.io
 from lang_main.analysis import graphs, tokens
-from lang_main.constants import SAVE_PATH_FOLDER, SPCY_MODEL
+from lang_main.analysis.timeline import (
+    calc_delta_to_next_failure,
+    filter_timeline_cands,
+)
+from lang_main.constants import (
+    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+    NAME_DELTA_FEAT_TO_REPAIR,
+    SAVE_PATH_FOLDER,
+    SPCY_MODEL,
+)
 from lang_main.errors import EmptyEdgesError, EmptyGraphError
 from lang_main.pipelines.predefined import (
     build_tk_graph_render_pipe,
     build_tk_graph_rescaling_pipe,
 )
-from lang_main.types import EntryPoints, ObjectID, TimelineCandidates
+from lang_main.types import (
+    DataFrameTLFiltered,
+    EntryPoints,
+    HTMLColumns,
+    HTMLTable,
+    ObjectID,
+    TimelineCandidates,
+)
 # ** data
 # p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
-p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
+p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
 (data,) = cast(tuple[DataFrame], lang_main.io.load_pickle(p_df))
+# data = cleanup_descriptions(data, properties=['ErledigungsBeschreibung'])
 # p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
-p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
+p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_CANDS)
 cands, texts = cast(
     tuple[TimelineCandidates, dict[ObjectID, str]], lang_main.io.load_pickle(p_tl)
 )
@@ -56,17 +75,27 @@ PTH_RENDERED_GRAPH = lang_main.io.get_entry_point(
     file_ext='.svg',
 )
+# NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
-TABLE_FEATS: Final[list[str]] = [
+TABLE_FEATS_OVERVIEW: Final[list[str]] = [
     'ErstellungsDatum',
     'ErledigungsDatum',
+    NAME_DELTA_FEAT_TO_REPAIR,
     'VorgangsTypName',
     'VorgangsBeschreibung',
+    'ErledigungsBeschreibung',
 ]
 TABLE_FEATS_DATES: Final[list[str]] = [
     'ErstellungsDatum',
     'ErledigungsDatum',
 ]
+TABLE_FEATS_BEST_ACTIONS: Final[list[str]] = [
+    'ErstellungsDatum',
+    'ErledigungsDatum',
+    'VorgangsTypName',
+    'VorgangsBeschreibung',
+    'ErledigungsBeschreibung',
+    NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+]
 # ** figure config
 MARKERS_OCCURRENCES: Final[dict[str, Any]] = {
@@ -86,13 +115,15 @@ HOVER_DATA: Final[dict[str, Any]] = {
     'ErstellungsDatum': '|%d.%m.%Y',
     'ErledigungsDatum': '|%d.%m.%Y',
     'VorgangsBeschreibung': True,
+    'ErledigungsBeschreibung': True,
 }
 HOVER_DATA_DELTA: Final[dict[str, Any]] = {
     'ErstellungsDatum': '|%d.%m.%Y',
     'ErledigungsDatum': '|%d.%m.%Y',
     'VorgangsDatum': '|%d.%m.%Y',
-    'delta': True,
+    NAME_DELTA_FEAT_TO_REPAIR: True,
     'VorgangsBeschreibung': True,
+    'ErledigungsBeschreibung': True,
 }
 # ** graph
@@ -136,10 +167,10 @@ graph_layout = html.Div(
                 html.Img(
                     id='static-graph-img',
                     alt='static rendered graph',
-                    # style={
-                    #     'width': 'auto',
-                    #     'height': 'auto',
-                    # },
+                    style={
+                        'width': 'auto',
+                        'height': 'auto',
+                    },
                 ),
                 html.P(id='info-graph-errors', children=[]),
             ],
@@ -186,7 +217,27 @@ app.layout = html.Div(
             ]
         ),
         html.Div(
-            [dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
+            children=[
+                html.Div(
+                    [
+                        html.H5('Überblick ähnlicher Vorgänge'),
+                        dash_table.DataTable(id='table-candidates'),
+                    ],
+                    style={'paddingBottom': '1em'},
+                ),
+                html.Div(
+                    [
+                        html.H5(
+                            (
+                                'Maßnahmen sortiert nach längstem Zeitraum '
+                                'bis zum nächsten Ereignis'
+                            )
+                        ),
+                        dash_table.DataTable(id='table-best-actions'),
+                    ]
+                ),
+            ],
+            style={'marginBottom': '2em', 'padding': '2em'},
         ),
         graph_layout,
     ],
@@ -222,20 +273,21 @@ def update_choice_candidates(obj_id):
 # ** helpers to filter DataFrame
-def pre_filter_data(
+def filter_candidates(
     data: DataFrame,
     idx: int,
     obj_id: ObjectID,
-) -> DataFrame:
+) -> DataFrameTLFiltered:
+    # assert correct data type because of Dash
     idx = int(idx)
     obj_id = int(obj_id)
-    # data = data.copy()
-    cands_for_obj_id = cands[obj_id]
-    cands_choice = cands_for_obj_id[int(idx) - 1]
-    # data
-    data = data.loc[list(cands_choice)].sort_index()  # type: ignore
-    data['delta'] = data['ErledigungsDatum'] - data['ErstellungsDatum']
-    data['delta'] = data['delta'].dt.days
+    data = filter_timeline_cands(
+        data=data,
+        cands=cands,
+        obj_id=obj_id,
+        entry_idx=(idx - 1),  # idx in Dashboard starts with 1
+    )
     return data
@@ -258,10 +310,10 @@ def update_timeline(index, obj_id):
     obj_text = texts[obj_id]
     title_occurrences = f'HObjektText: {obj_text}'
     title_delta = f'HObjektText: {obj_text}, Differenz Erstellung und Erledigung'
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = filter_candidates(data, idx=index, obj_id=obj_id)
     # figure
     fig_occurrences = fig_timeline_occurrences(df, title_occurrences)
-    fig_delta = fig_timeline_delta(df, title_delta)
+    fig_delta = fig_timeline_delta(df, title_delta, delta_feature=NAME_DELTA_FEAT_TO_REPAIR)
     return fig_occurrences, fig_delta
@@ -293,11 +345,12 @@ def fig_timeline_occurrences(
 def fig_timeline_delta(
     df: DataFrame,
     title: str,
+    delta_feature: str,
 ) -> Figure:
     fig = px.scatter(
         data_frame=df,
         x='ErstellungsDatum',
-        y='delta',
+        y=delta_feature,
         title=title,
         hover_data=HOVER_DATA_DELTA,
     )
@@ -309,25 +362,77 @@ def fig_timeline_delta(
     return fig
+def transform_to_HTML_table(
+    data: DataFrame,
+    target_features: Collection[str],
+    date_cols: Iterable[str] | None = None,
+    sorting_feature: str | None = None,
+    sorting_ascending: bool = True,
+) -> tuple[HTMLColumns, HTMLTable]:
+    target_features = list(target_features)
+    data = data.copy()
+    data = data.filter(items=target_features, axis=1)
+    if sorting_feature is not None:
+        data = data.sort_values(by='ErstellungsDatum', ascending=sorting_ascending)
+    if date_cols is not None:
+        for col in date_cols:
+            data[col] = data[col].dt.strftime(r'%Y-%m-%d')
+    columns = [{'name': col, 'id': col} for col in data.columns]
+    table_data = data.to_dict('records')
+    return columns, table_data
+# 'table-best-actions'
 # ** HTML table
 @callback(
-    [Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
+    [
+        Output('table-candidates', 'columns'),
+        Output('table-candidates', 'data'),
+        Output('table-best-actions', 'columns'),
+        Output('table-best-actions', 'data'),
+    ],
     Input('selector-candidates', 'value'),
     State('selector-obj_id', 'value'),
     prevent_initial_call=True,
 )
-def update_table_candidates(index, obj_id):
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
-    df = df.filter(items=TABLE_FEATS, axis=1).sort_values(
-        by='ErstellungsDatum', ascending=True
+def update_tables_candidates(
+    index,
+    obj_id,
+) -> tuple[HTMLColumns, HTMLTable, HTMLColumns, HTMLTable]:
+    cands = filter_candidates(data, idx=index, obj_id=obj_id)
+    overview_cols, overview_table = transform_to_HTML_table(
+        data=cands,
+        target_features=TABLE_FEATS_OVERVIEW,
+        date_cols=TABLE_FEATS_DATES,
+        sorting_feature='ErstellungsDatum',
+        sorting_ascending=True,
     )
-    cols = [{'name': i, 'id': i} for i in df.columns]
-    # convert dates to strings
-    for col in TABLE_FEATS_DATES:
-        df[col] = df[col].dt.strftime(r'%Y-%m-%d')
-    table_data = df.to_dict('records')
-    return table_data, cols
+    # df = df.filter(items=TABLE_FEATS_OVERVIEW, axis=1).sort_values(
+    #     by='ErstellungsDatum', ascending=True
+    # )
+    # cols = [{'name': i, 'id': i} for i in df.columns]
+    # # convert dates to strings
+    # for col in TABLE_FEATS_DATES:
+    #     df[col] = df[col].dt.strftime(r'%Y-%m-%d')
+    # table_data = df.to_dict('records')
+    cands_best_actions = calc_delta_to_next_failure(
+        data=cands,
+        date_feature='ErstellungsDatum',
+        name_delta_feature=NAME_DELTA_FEAT_TO_NEXT_FAILURE,
+    )
+    best_actions_cols, best_actions_table = transform_to_HTML_table(
+        data=cands_best_actions,
+        target_features=TABLE_FEATS_BEST_ACTIONS,
+        date_cols=TABLE_FEATS_DATES,
+    )
+    return overview_cols, overview_table, best_actions_cols, best_actions_table
@@ -345,7 +450,7 @@ def update_table_candidates(index, obj_id):
 def display_candidates_as_graph(index, obj_id):
     error_msg = ''
     t1 = time.perf_counter()
-    df = pre_filter_data(data, idx=index, obj_id=obj_id)
+    df = filter_candidates(data, idx=index, obj_id=obj_id)
     t2 = time.perf_counter()
     print(f'Time for filtering: {t2 - t1} s')
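Note on the dashboard change above: the new transform_to_HTML_table helper turns a filtered DataFrame into the (columns, data) pair that dash_table.DataTable consumes. A minimal standalone sketch of that conversion follows; the frame and its column names are hypothetical placeholders, not the real IHM features.

import pandas as pd

# hypothetical stand-in for the filtered candidate frame
df = pd.DataFrame(
    {
        'created': pd.to_datetime(['2024-01-05', '2024-02-10']),
        'description': ['pump inspected', 'seal replaced'],
    }
)

# same steps as in transform_to_HTML_table: format dates as strings,
# then build the column spec and the record list for dash_table.DataTable
df['created'] = df['created'].dt.strftime(r'%Y-%m-%d')
columns = [{'name': col, 'id': col} for col in df.columns]
table_data = df.to_dict('records')

print(columns)     # [{'name': 'created', 'id': 'created'}, {'name': 'description', 'id': 'description'}]
print(table_data)  # [{'created': '2024-01-05', 'description': 'pump inspected'}, ...]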

View File

@ -1,9 +1,8 @@
import re from collections.abc import Collection
from collections.abc import Iterable
from itertools import combinations from itertools import combinations
from math import factorial from math import factorial
from pathlib import Path from pathlib import Path
from typing import Callable, cast from typing import cast
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -25,11 +24,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex from lang_main.types import Embedding, PandasIndex
# ** RE patterns # TODO removal
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+') # pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}') # pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?') # pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_whitespace = re.compile(r'[ ]{2,}') # pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')
# ** (1) dataset preparation: loading and simple preprocessing # ** (1) dataset preparation: loading and simple preprocessing
@ -37,7 +37,7 @@ pattern_whitespace = re.compile(r'[ ]{2,}')
# duplicate cleansing based on all properties # duplicate cleansing based on all properties
def load_raw_data( def load_raw_data(
path: Path, path: Path,
date_cols: Iterable[str] = ( date_cols: Collection[str] = (
'VorgangsDatum', 'VorgangsDatum',
'ErledigungsDatum', 'ErledigungsDatum',
'Arbeitsbeginn', 'Arbeitsbeginn',
@ -50,7 +50,7 @@ def load_raw_data(
---------- ----------
path : str path : str
path to dataset file, usually CSV file path to dataset file, usually CSV file
date_cols : list[str], optional date_cols : Collection[str], optional
columns which contain dates and are parsed as such, columns which contain dates and are parsed as such,
by default ( by default (
'VorgangsDatum', 'VorgangsDatum',
@ -129,9 +129,7 @@ def remove_duplicates(
def remove_NA( def remove_NA(
data: DataFrame, data: DataFrame,
target_features: list[str] = [ target_features: Collection[str] = ('VorgangsBeschreibung',),
'VorgangsBeschreibung',
],
) -> tuple[DataFrame]: ) -> tuple[DataFrame]:
"""function to drop NA entries based on a subset of features to be analysed """function to drop NA entries based on a subset of features to be analysed
@ -139,14 +137,15 @@ def remove_NA(
---------- ----------
data : DataFrame data : DataFrame
standard IHM dataset, perhaps pre-cleaned standard IHM dataset, perhaps pre-cleaned
target_features : list[str], optional target_features : Collection[str], optional
subset to analyse to define an NA entry, by default [ 'VorgangsBeschreibung', ] subset to analyse to define an NA entry, by default ('VorgangsBeschreibung',)
Returns Returns
------- -------
DataFrame DataFrame
dataset with removed NA entries for given subset of features dataset with removed NA entries for given subset of features
""" """
target_features = list(target_features)
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info( logger.info(
f'Removed NA entries for features >>{target_features}<< from dataset successfully.' f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
@ -156,46 +155,7 @@ def remove_NA(
# ** (2) entry-based cleansing # ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset # ** moved to module ``lang_main.analysis.shared``
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
def entry_wise_cleansing(
data: DataFrame,
target_feature: str,
cleansing_func: Callable[[str], str],
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)
# ** in-depth analysis of one feature # ** in-depth analysis of one feature

View File

@ -1,4 +1,5 @@
from collections.abc import Iterable, Iterator import re
from collections.abc import Callable, Collection, Iterable, Iterator
from typing import cast from typing import cast
import networkx as nx import networkx as nx
@ -7,14 +8,70 @@ import numpy.typing as npt
import sentence_transformers import sentence_transformers
import sentence_transformers.util import sentence_transformers.util
from networkx import Graph from networkx import Graph
from pandas import Series from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer from sentence_transformers import SentenceTransformer
from torch import Tensor from torch import Tensor
from lang_main.analysis.graphs import get_graph_metadata, update_graph from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.loggers import logger_preprocess as logger
from lang_main.types import PandasIndex from lang_main.types import PandasIndex
# ** RE patterns
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# ** RE applications
# following functions clean and prepare specific entries, not whole datasets
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_escape_newline.sub('. ', string)
string = pattern_escape_seq.sub(' ', string)
string = pattern_repeated_chars.sub('', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
# ** dataset cleansing
def entry_wise_cleansing(
data: DataFrame,
target_features: Collection[str],
cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
target_features = list(target_features)
data[target_features] = data[target_features].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
cleansing_func.__name__,
target_features,
)
return (data,)
# ** similarities
def candidates_by_index( def candidates_by_index(
data_model_input: Series, data_model_input: Series,
model: SentenceTransformer, model: SentenceTransformer,
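A condensed, self-contained sketch of what the relocated cleaning chain does to a single entry; the input string is invented and the expected output is derived from the patterns added in the hunk above (pattern_dates is omitted here, as it is also unused in clean_string_slim).

import re

# same patterns as in lang_main.analysis.shared (see hunk above)
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_whitespace = re.compile(r'[ ]{2,}')

def clean_string_slim(string: str) -> str:
    # newlines become sentence breaks, other escape sequences become spaces,
    # punctuation runs collapse to their last character, whitespace is squeezed
    string = pattern_escape_newline.sub('. ', string)
    string = pattern_escape_seq.sub(' ', string)
    string = pattern_repeated_chars.sub('', string)
    string = pattern_whitespace.sub(' ', string)
    return string.strip()

print(clean_string_slim('Pumpe defekt!!!\n\nMotor getauscht...  OK'))
# -> 'Pumpe defekt. Motor getauscht. OK'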

View File

@@ -1,4 +1,4 @@
-from collections.abc import Iterable, Iterator
+from collections.abc import Collection, Iterable, Iterator
 from typing import cast
 from pandas import DataFrame, Series
@@ -7,14 +7,21 @@ from tqdm.auto import tqdm  # TODO: check deletion
 from lang_main.analysis.shared import (
     candidates_by_index,
+    entry_wise_cleansing,
+    pattern_escape_seq_sentences,
     similar_index_connection_graph,
     similar_index_groups,
 )
 from lang_main.loggers import logger_timeline as logger
-from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
+from lang_main.types import (
+    DataFrameTLFiltered,
+    ObjectID,
+    PandasIndex,
+    TimelineCandidates,
+)
-def non_relevant_obj_ids(
+def _non_relevant_obj_ids(
     data: DataFrame,
     thresh_unique_feat_per_id: int,
     *,
@@ -50,9 +57,9 @@ def remove_non_relevant_obj_ids(
     feature_uniqueness: str = 'HObjektText',
     feature_obj_id: str = 'ObjektID',
 ) -> tuple[DataFrame]:
-    logger.info('Removing non-relevant ObjectIDs from dataset')
+    logger.info('Removing non-relevant ObjectIDs from dataset...')
     data = data.copy()
-    ids_to_ignore = non_relevant_obj_ids(
+    ids_to_ignore = _non_relevant_obj_ids(
         data=data,
         thresh_unique_feat_per_id=thresh_unique_feat_per_id,
         feature_uniqueness=feature_uniqueness,
@@ -61,7 +68,43 @@
     # only retain entries with ObjectIDs not in IDs to ignore
     data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
     logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
-    logger.info('Non-relevant ObjectIDs removed successfully')
+    logger.info('Non-relevant ObjectIDs removed successfully.')
+    return (data,)
+def cleanup_descriptions(
+    data: DataFrame,
+    properties: Collection[str] = (
+        'VorgangsBeschreibung',
+        'ErledigungsBeschreibung',
+    ),
+) -> tuple[DataFrame]:
+    logger.info('Cleaning necessary descriptions...')
+    data = data.copy()
+    features = list(properties)
+    data[features] = data[features].fillna('N.V.')
+    (data,) = entry_wise_cleansing(data, target_features=features)
+    logger.info('Cleansing successful.')
+    return (data.copy(),)
+def calc_delta_to_repair(
+    data: DataFrame,
+    date_feature_start: str = 'ErstellungsDatum',
+    date_feature_end: str = 'ErledigungsDatum',
+    name_delta_feature: str = 'delta_to_repair',
+    convert_to_days: bool = True,
+) -> tuple[DataFrame]:
+    logger.info('Calculating time differences between start and end of operations...')
+    data = data.copy()
+    data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]
+    if convert_to_days:
+        data[name_delta_feature] = data[name_delta_feature].dt.days
+    logger.info('Calculation successful.')
     return (data,)
@@ -75,7 +118,7 @@ def generate_model_input(
         'VorgangsBeschreibung',
     ),
 ) -> tuple[DataFrame]:
-    logger.info('Generating concatenation of model input features')
+    logger.info('Generating concatenation of model input features...')
     data = data.copy()
     model_input_features = list(model_input_features)
     input_features = data[model_input_features].fillna('').astype(str)
@@ -83,7 +126,7 @@
         lambda x: ' - '.join(x),
         axis=1,
     )
-    logger.info('Model input generated successfully')
+    logger.info('Model input generated successfully.')
     return (data,)
@@ -97,7 +140,7 @@ def filter_activities_per_obj_id(
 ) -> tuple[DataFrame, Series]:
     data = data.copy()
     # filter only relevant activities count occurrences for each ObjectID
-    logger.info('Filtering activities per ObjectID')
+    logger.info('Filtering activities per ObjectID...')
     filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
     data_filter_activities = data.loc[filt_rel_activities].copy()
     num_activities_per_obj_id = cast(
@@ -113,7 +156,7 @@
     num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
     data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
-    logger.info('Activities per ObjectID filtered successfully')
+    logger.info('Activities per ObjectID filtered successfully.')
     return data_filter_activities, num_activities_per_obj_id
@@ -129,7 +172,7 @@ def filter_activities_per_obj_id(
 ## use idx pairs to get idx values of series
-def get_timeline_candidates_index(
+def _get_timeline_candidates_index(
     data: DataFrame,
     num_activities_per_obj_id: Series,
     *,
@@ -161,7 +204,7 @@ def get_timeline_candidates_index(
 # TODO: check application for duplicate removal
-def transform_timeline_candidates(
+def _transform_timeline_candidates(
     candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
 ) -> TimelineCandidates:
     """function to build a mapping of ObjectIDs to their respective collection of
@@ -200,7 +243,7 @@ def transform_timeline_candidates(
     return candidates_by_obj_id
-def map_obj_id_to_texts(
+def _map_obj_id_to_texts(
     data: DataFrame,
     feature_obj_id: str = 'ObjektID',
 ) -> dict[ObjectID, str]:
@@ -229,7 +272,7 @@ def get_timeline_candidates(
     model_input_feature: str = 'nlp_model_input',
 ) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
     logger.info('Obtaining timeline candidates...')
-    candidates = get_timeline_candidates_index(
+    candidates = _get_timeline_candidates_index(
         data=data,
         num_activities_per_obj_id=num_activities_per_obj_id,
         model=model,
@@ -237,14 +280,52 @@
         feature_obj_id=feature_obj_id,
         model_input_feature=model_input_feature,
     )
-    tl_candidates = transform_timeline_candidates(candidates)
+    tl_candidates = _transform_timeline_candidates(candidates)
     logger.info('Timeline candidates obtained successfully.')
     # text mapping to obtain object descriptors
     logger.info('Mapping ObjectIDs to their respective text descriptor...')
-    map_obj_text = map_obj_id_to_texts(
+    map_obj_text = _map_obj_id_to_texts(
         data=data,
         feature_obj_id=feature_obj_id,
     )
     logger.info('ObjectIDs successfully mapped to text descriptors.')
     return tl_candidates, map_obj_text
+# ** Postprocessing
+# filter original dataset for a batch of timeline candidates
+def filter_timeline_cands(
+    data: DataFrame,
+    cands: TimelineCandidates,
+    obj_id: ObjectID,
+    entry_idx: int,
+    sort_feature: str = 'ErstellungsDatum',
+) -> DataFrameTLFiltered:
+    data = data.copy()
+    cands_for_obj_id = cands[obj_id]
+    cands_choice = cands_for_obj_id[entry_idx]
+    data = data.loc[list(cands_choice)].sort_values(
+        by=sort_feature,
+        ascending=True,
+    )
+    return data
+def calc_delta_to_next_failure(
+    data: DataFrameTLFiltered,
+    date_feature: str = 'ErstellungsDatum',
+    name_delta_feature: str = 'delta_to_next_failure',
+    convert_to_days: bool = True,
+) -> DataFrameTLFiltered:
+    data = data.copy()
+    last_val = data[date_feature].iat[-1]
+    shifted = data[date_feature].shift(-1, fill_value=last_val)
+    data[name_delta_feature] = shifted - data[date_feature]
+    data = data.sort_values(by=name_delta_feature, ascending=False)
+    if convert_to_days:
+        data[name_delta_feature] = data[name_delta_feature].dt.days
+    return data
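To illustrate the shift-based delta in calc_delta_to_next_failure above: each event's delta is the gap to the next event in the date-sorted candidate timeline, the last event is filled with its own date (delta 0), and the rows are then ordered by the longest gap first. A minimal sketch with made-up dates:

import pandas as pd

# hypothetical, already date-sorted candidate timeline
data = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-01-11', '2024-03-01'])}
)

# same mechanics as calc_delta_to_next_failure
last_val = data['ErstellungsDatum'].iat[-1]
shifted = data['ErstellungsDatum'].shift(-1, fill_value=last_val)
data['delta_to_next_failure'] = shifted - data['ErstellungsDatum']
data = data.sort_values(by='delta_to_next_failure', ascending=False)
data['delta_to_next_failure'] = data['delta_to_next_failure'].dt.days

print(data)
#   ErstellungsDatum  delta_to_next_failure
# 1       2024-01-11                     50
# 0       2024-01-01                     10
# 2       2024-03-01                      0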

View File

@@ -76,13 +76,14 @@ CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
     'isDeterministic': True,
     'singlePartition': False,
 }
+CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
 CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
 # name for property, on which selection is done
 CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
 CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
 CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
-# ** time analysis.uniqueness
+# ** time_analysis.uniqueness
 THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
     'threshold_unique_texts'
 ]
@@ -90,6 +91,10 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
     'criterion_feature'
 ]
 FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
+# ** time_analysis.preparation
+NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
+# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'Zeitspanne bis zur Behebung [Tage]'
+NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
 # ** time_analysis.model_input
 MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
     CONFIG['time_analysis']['model_input']['input_features']

View File

@@ -3,15 +3,19 @@ from pathlib import Path
 from lang_main.analysis import graphs
 from lang_main.analysis.preprocessing import (
     analyse_feature,
-    clean_string_slim,
-    entry_wise_cleansing,
     load_raw_data,
     merge_similarity_dupl,
     numeric_pre_filter_feature,
     remove_duplicates,
     remove_NA,
 )
+from lang_main.analysis.shared import (
+    clean_string_slim,
+    entry_wise_cleansing,
+)
 from lang_main.analysis.timeline import (
+    calc_delta_to_repair,
+    cleanup_descriptions,
     filter_activities_per_obj_id,
     generate_model_input,
     get_timeline_candidates,
@@ -25,6 +29,7 @@ from lang_main.constants import (
     DATE_COLS,
     FEATURE_NAME_OBJ_ID,
     MODEL_INPUT_FEATURES,
+    NAME_DELTA_FEAT_TO_REPAIR,
     SAVE_PATH_FOLDER,
     SPCY_MODEL,
     STFR_MODEL,
@@ -56,7 +61,7 @@ def build_base_target_feature_pipe() -> Pipeline:
     pipe_target_feat.add(
         entry_wise_cleansing,
         {
-            'target_feature': 'VorgangsBeschreibung',
+            'target_feature': ('VorgangsBeschreibung',),
             'cleansing_func': clean_string_slim,
         },
         save_result=True,
@@ -182,7 +187,6 @@ def build_tk_graph_rescaling_pipe(
         graphs.pipe_add_graph_metrics,
         save_result=save_result,
         filename=exit_point,
-        # filename=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
     )
     return pipe_graph_rescaling
@@ -247,6 +251,23 @@ def build_tk_graph_render_pipe(
 # ** timeline analysis
 def build_timeline_pipe() -> Pipeline:
     pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
+    pipe_timeline.add(
+        cleanup_descriptions,
+        {
+            'properties': ['ErledigungsBeschreibung'],
+        },
+    )
+    pipe_timeline.add(
+        calc_delta_to_repair,
+        {
+            'date_feature_start': 'ErstellungsDatum',
+            'date_feature_end': 'ErledigungsDatum',
+            'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
+            'convert_to_days': True,
+        },
+        save_result=True,
+        filename=EntryPoints.TIMELINE_POST,
+    )
     pipe_timeline.add(
         remove_non_relevant_obj_ids,
         {
@@ -281,7 +302,7 @@ def build_timeline_pipe() -> Pipeline:
             'model_input_feature': 'nlp_model_input',
         },
         save_result=True,
-        filename=EntryPoints.TIMELINE_POST,
+        filename=EntryPoints.TIMELINE_CANDS,
     )
     return pipe_timeline

View File

@@ -16,6 +16,7 @@ from lang_main.constants import (
     CYTO_LAYOUT_PROPERTIES,
     CYTO_NUMBER_SUBGRAPHS,
     CYTO_PATH_STYLESHEET,
+    CYTO_SANDBOX_NAME,
     CYTO_SELECTION_PROPERTY,
     CYTO_STYLESHEET_NAME,
     PROPERTY_NAME_DEGREE_WEIGHTED,
@@ -56,6 +57,8 @@ def verify_connection():
 def import_to_cytoscape(
     graph: DiGraph | Graph,
     network_name: str = CYTO_BASE_NETWORK_NAME,
+    sandbox_name: str = CYTO_SANDBOX_NAME,
+    reinitialise_sandbox: bool = True,
 ) -> None:
     """Cytoscape: import NetworkX graph as new network collection
@@ -66,6 +69,12 @@ def import_to_cytoscape(
     """
     logger.debug('Checking Cytoscape connection...')
     verify_connection()
+    logger.debug('Setting Cytoscape sandbox...')
+    p4c.sandbox_set(
+        sandbox_name=sandbox_name,
+        reinitialize=reinitialise_sandbox,
+        copy_samples=False,
+    )
     logger.debug('Importing to and analysing network in Cytoscape...')
     p4c.delete_all_networks()
     p4c.create_network_from_networkx(
@@ -122,6 +131,7 @@ def export_network_to_image(
     filetype: CytoExportFileTypes = 'SVG',
     network_name: str = CYTO_BASE_NETWORK_NAME,
     pdf_export_page_size: CytoExportPageSizes = 'A4',
+    sandbox_name: str = CYTO_SANDBOX_NAME,
 ) -> None:
     """Cytoscape: export current selected view as image
@@ -140,14 +150,17 @@
     logger.debug('Exporting image to file...')
     if not target_folder.exists():
         target_folder.mkdir(parents=True)
-    file_pth = target_folder / filename
+    dst_file_pth = (target_folder / filename).with_suffix(f'.{filetype.lower()}')
     text_as_font = True
     if filetype == 'SVG':
         text_as_font = False
+    # image is generated in sandbox directory and transferred to target destination
+    # (preparation for remote instances of Cytoscape)
+    # TODO close non-necessary windows before image display
     p4c.export_image(
-        filename=str(file_pth),
+        filename=filename,
         type=filetype,
         network=network_name,
         overwrite_file=True,
@@ -155,7 +168,24 @@
         export_text_as_font=text_as_font,
         page_size=pdf_export_page_size,
     )
-    logger.debug('Exporting image to file successful.')
+    # TODO change back to Cytoscape 3.10 and above
+    # TODO remove if Cytoscape >= 3.10.* is running in container
+    # p4c.export_image(
+    #     filename=filename,
+    #     type=filetype,
+    #     network=network_name,
+    #     overwrite_file=True,
+    # )
+    logger.debug('Exported image to sandbox.')
+    logger.debug('Transferring image from sandbox to target destination...')
+    sandbox_filename = f'{filename}.{filetype.lower()}'
+    p4c.sandbox_get_from(
+        source_file=sandbox_filename,
+        dest_file=str(dst_file_pth),
+        overwrite=True,
+        sandbox_name=sandbox_name,
+    )
+    logger.debug('Transfer of image from sandbox to target destination successful.')
 def layout_network(
@@ -192,6 +222,7 @@ def apply_style_to_network(
     node_size_property: str = 'node_selection',
     min_node_size: int = 15,
     max_node_size: int = 40,
+    sandbox_name: str = CYTO_SANDBOX_NAME,
 ) -> None:
     """Cytoscape: apply a chosen Cytoscape style to the defined network
@@ -221,7 +252,16 @@
         raise FileNotFoundError(
             f'Visual stylesheet for Cytoscape not found under: >>{pth_to_stylesheet}<<'
         )
-    p4c.import_visual_styles(str(pth_to_stylesheet))
+    # send to sandbox
+    sandbox_filename = pth_to_stylesheet.name
+    p4c.sandbox_send_to(
+        source_file=pth_to_stylesheet,
+        dest_file=sandbox_filename,
+        overwrite=True,
+        sandbox_name=sandbox_name,
+    )
+    # load stylesheet
+    p4c.import_visual_styles(sandbox_filename)
     p4c.set_visual_style(style_name, network=network_name)
     # node size mapping, only if needed property is available
@@ -242,6 +282,7 @@ def apply_style_to_network(
             default_number=min_node_size,
         )
         p4c.set_node_size_mapping(**node_size_map)
+    # TODO removal
     # else:
     #     node_table = p4c.get_table_columns(table='node', network=network_name)
     #     nodes_SUID = node_table['SUID'].to_list()
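The export change above routes all file traffic through a named py4cytoscape sandbox instead of writing directly to a local path, which also works when Cytoscape runs remotely or in a container. A condensed sketch of that round-trip, assuming a reachable Cytoscape instance and using only the calls that appear in the diff; the sandbox name, stylesheet path, and output names are placeholders.

from pathlib import Path

import py4cytoscape as p4c

SANDBOX = 'lang_main'  # placeholder, mirrors CYTO_SANDBOX_NAME

# pin all file transfers to a named sandbox on the Cytoscape side
p4c.sandbox_set(sandbox_name=SANDBOX, reinitialize=True, copy_samples=False)

# push a local stylesheet into the sandbox, then load it from there
p4c.sandbox_send_to(
    source_file='styles/lang_main.xml',  # hypothetical local stylesheet path
    dest_file='lang_main.xml',
    overwrite=True,
    sandbox_name=SANDBOX,
)
p4c.import_visual_styles('lang_main.xml')

# the export renders into the sandbox; pull the result back to a local destination
p4c.export_image(filename='graph', type='SVG', overwrite_file=True)
p4c.sandbox_get_from(
    source_file='graph.svg',
    dest_file=str(Path('results/graph.svg')),
    overwrite=True,
    sandbox_name=SANDBOX,
)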

View File

@@ -1,5 +1,7 @@
 import enum
+from collections.abc import Hashable
 from typing import (
+    Any,
     Literal,
     Required,
     TypeAlias,
@@ -7,6 +9,7 @@ from typing import (
 )
 import numpy as np
+from pandas import DataFrame
 from spacy.tokens.doc import Doc as SpacyDoc
 from torch import Tensor
@@ -33,6 +36,7 @@ ResultHandling: TypeAlias = list[tuple[bool, str | None]]
 class EntryPoints(enum.StrEnum):
     TIMELINE = 'TIMELINE'
     TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
+    TIMELINE_CANDS = 'TIMELINE_CANDIDATES'
     TIMELINE_TK_GRAPH_RESCALED = 'TIMELINE_TK_GRAPH_RESCALED'
     TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
     TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
@@ -44,6 +48,7 @@ class EntryPoints(enum.StrEnum):
 PandasIndex: TypeAlias = int | np.int64
 ObjectID: TypeAlias = int
 Embedding: TypeAlias = SpacyDoc | Tensor
+DataFrameTLFiltered: TypeAlias = DataFrame
 # ** graphs
 NodeTitle: TypeAlias = str
@@ -118,3 +123,8 @@ class CytoscapeData(TypedDict, total=False):
 # ** timeline
 TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
+# ** Dash (Dashboard)
+HTMLTable: TypeAlias = list[dict[Hashable, Any]]
+HTMLColumns: TypeAlias = list[dict[str, str]]

File diff suppressed because one or more lines are too long