sandboxing

This commit is contained in:
Florian Förster 2024-08-05 08:43:45 +02:00
parent 9197146d2c
commit 3f58a14852
10 changed files with 2362 additions and 283 deletions

View File

@ -1,9 +1,11 @@
import time
import webbrowser
from pathlib import Path
from collections.abc import Collection, Iterable
from threading import Thread
from typing import Any, Final, cast
import pandas as pd
# import dash_cytoscape as cyto
import plotly.express as px
from dash import (
@ -21,20 +23,37 @@ from plotly.graph_objects import Figure
import lang_main.io
from lang_main.analysis import graphs, tokens
from lang_main.constants import SAVE_PATH_FOLDER, SPCY_MODEL
from lang_main.analysis.timeline import (
calc_delta_to_next_failure,
filter_timeline_cands,
)
from lang_main.constants import (
NAME_DELTA_FEAT_TO_NEXT_FAILURE,
NAME_DELTA_FEAT_TO_REPAIR,
SAVE_PATH_FOLDER,
SPCY_MODEL,
)
from lang_main.errors import EmptyEdgesError, EmptyGraphError
from lang_main.pipelines.predefined import (
build_tk_graph_render_pipe,
build_tk_graph_rescaling_pipe,
)
from lang_main.types import EntryPoints, ObjectID, TimelineCandidates
from lang_main.types import (
DataFrameTLFiltered,
EntryPoints,
HTMLColumns,
HTMLTable,
ObjectID,
TimelineCandidates,
)
# ** data
# p_df = Path(r'../results/test_20240619/TIMELINE.pkl').resolve()
p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE)
p_df = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
(data,) = cast(tuple[DataFrame], lang_main.io.load_pickle(p_df))
# data = cleanup_descriptions(data, properties=['ErledigungsBeschreibung'])
# p_tl = Path(r'../results/test_20240619/TIMELINE_POSTPROCESSING.pkl').resolve()
p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_POST)
p_tl = lang_main.io.get_entry_point(SAVE_PATH_FOLDER, EntryPoints.TIMELINE_CANDS)
cands, texts = cast(
tuple[TimelineCandidates, dict[ObjectID, str]], lang_main.io.load_pickle(p_tl)
)
@ -56,17 +75,27 @@ PTH_RENDERED_GRAPH = lang_main.io.get_entry_point(
file_ext='.svg',
)
TABLE_FEATS: Final[list[str]] = [
# NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
TABLE_FEATS_OVERVIEW: Final[list[str]] = [
'ErstellungsDatum',
'ErledigungsDatum',
NAME_DELTA_FEAT_TO_REPAIR,
'VorgangsTypName',
'VorgangsBeschreibung',
'ErledigungsBeschreibung',
]
TABLE_FEATS_DATES: Final[list[str]] = [
'ErstellungsDatum',
'ErledigungsDatum',
]
TABLE_FEATS_BEST_ACTIONS: Final[list[str]] = [
'ErstellungsDatum',
'ErledigungsDatum',
'VorgangsTypName',
'VorgangsBeschreibung',
'ErledigungsBeschreibung',
NAME_DELTA_FEAT_TO_NEXT_FAILURE,
]
# ** figure config
MARKERS_OCCURRENCES: Final[dict[str, Any]] = {
@ -86,13 +115,15 @@ HOVER_DATA: Final[dict[str, Any]] = {
'ErstellungsDatum': '|%d.%m.%Y',
'ErledigungsDatum': '|%d.%m.%Y',
'VorgangsBeschreibung': True,
'ErledigungsBeschreibung': True,
}
HOVER_DATA_DELTA: Final[dict[str, Any]] = {
'ErstellungsDatum': '|%d.%m.%Y',
'ErledigungsDatum': '|%d.%m.%Y',
'VorgangsDatum': '|%d.%m.%Y',
'delta': True,
NAME_DELTA_FEAT_TO_REPAIR: True,
'VorgangsBeschreibung': True,
'ErledigungsBeschreibung': True,
}
# ** graph
@ -136,10 +167,10 @@ graph_layout = html.Div(
html.Img(
id='static-graph-img',
alt='static rendered graph',
# style={
# 'width': 'auto',
# 'height': 'auto',
# },
style={
'width': 'auto',
'height': 'auto',
},
),
html.P(id='info-graph-errors', children=[]),
],
@ -186,7 +217,27 @@ app.layout = html.Div(
]
),
html.Div(
[dash_table.DataTable(id='table-candidates')], style={'marginBottom': '2em'}
children=[
html.Div(
[
html.H5('Überblick ähnlicher Vorgänge'),
dash_table.DataTable(id='table-candidates'),
],
style={'paddingBottom': '1em'},
),
html.Div(
[
html.H5(
(
'Maßnahmen sortiert nach längstem Zeitraum '
'bis zum nächsten Ereignis'
)
),
dash_table.DataTable(id='table-best-actions'),
]
),
],
style={'marginBottom': '2em', 'padding': '2em'},
),
graph_layout,
],
@ -222,20 +273,21 @@ def update_choice_candidates(obj_id):
# ** helpers to filter DataFrame
def pre_filter_data(
def filter_candidates(
data: DataFrame,
idx: int,
obj_id: ObjectID,
) -> DataFrame:
) -> DataFrameTLFiltered:
# ensure correct data types because of Dash
idx = int(idx)
obj_id = int(obj_id)
# data = data.copy()
cands_for_obj_id = cands[obj_id]
cands_choice = cands_for_obj_id[int(idx) - 1]
# data
data = data.loc[list(cands_choice)].sort_index() # type: ignore
data['delta'] = data['ErledigungsDatum'] - data['ErstellungsDatum']
data['delta'] = data['delta'].dt.days
data = filter_timeline_cands(
data=data,
cands=cands,
obj_id=obj_id,
entry_idx=(idx - 1), # idx in Dashboard starts with 1
)
return data
@ -258,10 +310,10 @@ def update_timeline(index, obj_id):
obj_text = texts[obj_id]
title_occurrences = f'HObjektText: {obj_text}'
title_delta = f'HObjektText: {obj_text}, Differenz Erstellung und Erledigung'
df = pre_filter_data(data, idx=index, obj_id=obj_id)
df = filter_candidates(data, idx=index, obj_id=obj_id)
# figure
fig_occurrences = fig_timeline_occurrences(df, title_occurrences)
fig_delta = fig_timeline_delta(df, title_delta)
fig_delta = fig_timeline_delta(df, title_delta, delta_feature=NAME_DELTA_FEAT_TO_REPAIR)
return fig_occurrences, fig_delta
@ -293,11 +345,12 @@ def fig_timeline_occurrences(
def fig_timeline_delta(
df: DataFrame,
title: str,
delta_feature: str,
) -> Figure:
fig = px.scatter(
data_frame=df,
x='ErstellungsDatum',
y='delta',
y=delta_feature,
title=title,
hover_data=HOVER_DATA_DELTA,
)
@ -309,25 +362,77 @@ def fig_timeline_delta(
return fig
def transform_to_HTML_table(
data: DataFrame,
target_features: Collection[str],
date_cols: Iterable[str] | None = None,
sorting_feature: str | None = None,
sorting_ascending: bool = True,
) -> tuple[HTMLColumns, HTMLTable]:
target_features = list(target_features)
data = data.copy()
data = data.filter(items=target_features, axis=1)
if sorting_feature is not None:
data = data.sort_values(by=sorting_feature, ascending=sorting_ascending)
if date_cols is not None:
for col in date_cols:
data[col] = data[col].dt.strftime(r'%Y-%m-%d')
columns = [{'name': col, 'id': col} for col in data.columns]
table_data = data.to_dict('records')
return columns, table_data
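
A minimal usage sketch of transform_to_HTML_table on a toy frame (feature names as in the dataset above; row values invented):

import pandas as pd

toy = pd.DataFrame(
    {
        'ErstellungsDatum': pd.to_datetime(['2024-03-02', '2024-01-15']),
        'VorgangsBeschreibung': ['Pumpe defekt', 'Filter getauscht'],  # invented values
    }
)
cols, table = transform_to_HTML_table(
    data=toy,
    target_features=['ErstellungsDatum', 'VorgangsBeschreibung'],
    date_cols=['ErstellungsDatum'],
    sorting_feature='ErstellungsDatum',
)
# cols  -> [{'name': 'ErstellungsDatum', 'id': 'ErstellungsDatum'},
#           {'name': 'VorgangsBeschreibung', 'id': 'VorgangsBeschreibung'}]
# table -> [{'ErstellungsDatum': '2024-01-15', 'VorgangsBeschreibung': 'Filter getauscht'},
#           {'ErstellungsDatum': '2024-03-02', 'VorgangsBeschreibung': 'Pumpe defekt'}]

Both values plug directly into the columns and data properties of dash_table.DataTable.
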
# 'table-best-actions'
# ** HTML table
@callback(
[Output('table-candidates', 'data'), Output('table-candidates', 'columns')],
[
Output('table-candidates', 'columns'),
Output('table-candidates', 'data'),
Output('table-best-actions', 'columns'),
Output('table-best-actions', 'data'),
],
Input('selector-candidates', 'value'),
State('selector-obj_id', 'value'),
prevent_initial_call=True,
)
def update_table_candidates(index, obj_id):
df = pre_filter_data(data, idx=index, obj_id=obj_id)
df = df.filter(items=TABLE_FEATS, axis=1).sort_values(
by='ErstellungsDatum', ascending=True
def update_tables_candidates(
index,
obj_id,
) -> tuple[HTMLColumns, HTMLTable, HTMLColumns, HTMLTable]:
cands = filter_candidates(data, idx=index, obj_id=obj_id)
overview_cols, overview_table = transform_to_HTML_table(
data=cands,
target_features=TABLE_FEATS_OVERVIEW,
date_cols=TABLE_FEATS_DATES,
sorting_feature='ErstellungsDatum',
sorting_ascending=True,
)
cols = [{'name': i, 'id': i} for i in df.columns]
# convert dates to strings
for col in TABLE_FEATS_DATES:
df[col] = df[col].dt.strftime(r'%Y-%m-%d')
# df = df.filter(items=TABLE_FEATS_OVERVIEW, axis=1).sort_values(
# by='ErstellungsDatum', ascending=True
# )
# cols = [{'name': i, 'id': i} for i in df.columns]
# # convert dates to strings
# for col in TABLE_FEATS_DATES:
# df[col] = df[col].dt.strftime(r'%Y-%m-%d')
table_data = df.to_dict('records')
return table_data, cols
# table_data = df.to_dict('records')
cands_best_actions = calc_delta_to_next_failure(
data=cands,
date_feature='ErstellungsDatum',
name_delta_feature=NAME_DELTA_FEAT_TO_NEXT_FAILURE,
)
best_actions_cols, best_actions_table = transform_to_HTML_table(
data=cands_best_actions,
target_features=TABLE_FEATS_BEST_ACTIONS,
date_cols=TABLE_FEATS_DATES,
)
return overview_cols, overview_table, best_actions_cols, best_actions_table
# ** graph callbacks
@ -345,7 +450,7 @@ def update_table_candidates(index, obj_id):
def display_candidates_as_graph(index, obj_id):
error_msg = ''
t1 = time.perf_counter()
df = pre_filter_data(data, idx=index, obj_id=obj_id)
df = filter_candidates(data, idx=index, obj_id=obj_id)
t2 = time.perf_counter()
print(f'Time for filtering: {t2 - t1} s')

View File

@ -1,9 +1,8 @@
import re
from collections.abc import Iterable
from collections.abc import Collection
from itertools import combinations
from math import factorial
from pathlib import Path
from typing import Callable, cast
from typing import cast
import numpy as np
import pandas as pd
@ -25,11 +24,12 @@ from lang_main.loggers import logger_preprocess as logger
from lang_main.pipelines.base import Pipeline
from lang_main.types import Embedding, PandasIndex
# ** RE patterns
pattern_special_chars = re.compile(r'[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# TODO removal
# pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
# pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
# pattern_repeated_chars = re.compile(r'([,;.:!?-_\+]){2,}')
# pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
# pattern_whitespace = re.compile(r'[ ]{2,}')
# ** (1) dataset preparation: loading and simple preprocessing
@ -37,7 +37,7 @@ pattern_whitespace = re.compile(r'[ ]{2,}')
# duplicate cleansing based on all properties
def load_raw_data(
path: Path,
date_cols: Iterable[str] = (
date_cols: Collection[str] = (
'VorgangsDatum',
'ErledigungsDatum',
'Arbeitsbeginn',
@ -50,7 +50,7 @@ def load_raw_data(
----------
path : Path
path to the dataset file, usually a CSV file
date_cols : list[str], optional
date_cols : Collection[str], optional
columns which contain dates and are parsed as such,
by default (
'VorgangsDatum',
@ -129,9 +129,7 @@ def remove_duplicates(
def remove_NA(
data: DataFrame,
target_features: list[str] = [
'VorgangsBeschreibung',
],
target_features: Collection[str] = ('VorgangsBeschreibung',),
) -> tuple[DataFrame]:
"""function to drop NA entries based on a subset of features to be analysed
@ -139,14 +137,15 @@ def remove_NA(
----------
data : DataFrame
standard IHM dataset, perhaps pre-cleaned
target_features : list[str], optional
subset to analyse to define an NA entry, by default [ 'VorgangsBeschreibung', ]
target_features : Collection[str], optional
subset of features analysed to decide whether an entry counts as NA, by default ('VorgangsBeschreibung',)
Returns
-------
DataFrame
dataset with removed NA entries for given subset of features
"""
target_features = list(target_features)
wo_NA = data.dropna(axis=0, subset=target_features, ignore_index=True).copy() # type: ignore
logger.info(
f'Removed NA entries for features >>{target_features}<< from dataset successfully.'
@ -156,46 +155,7 @@ def remove_NA(
# ** (2) entry-based cleansing
# following functions clean and prepare specific entries, not whole dataset
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, used to be applied element-wise for string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_special_chars.sub(' ', string)
string = pattern_repeated_chars.sub(r'\1', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
def entry_wise_cleansing(
data: DataFrame,
target_feature: str,
cleansing_func: Callable[[str], str],
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
data[target_feature] = data[target_feature].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for feature >>%s<<'),
cleansing_func.__name__,
target_feature,
)
return (data,)
# ** moved to module ``lang_main.analysis.shared``
# ** in-depth analysis of one feature

View File

@ -1,4 +1,5 @@
from collections.abc import Iterable, Iterator
import re
from collections.abc import Callable, Collection, Iterable, Iterator
from typing import cast
import networkx as nx
@ -7,14 +8,70 @@ import numpy.typing as npt
import sentence_transformers
import sentence_transformers.util
from networkx import Graph
from pandas import Series
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer
from torch import Tensor
from lang_main.analysis.graphs import get_graph_metadata, update_graph
from lang_main.loggers import logger_preprocess as logger
from lang_main.types import PandasIndex
# ** RE patterns
pattern_escape_newline = re.compile(r'[\n]+')
pattern_escape_seq = re.compile(r'[\t\n\r\f\v]+')
pattern_escape_seq_sentences = re.compile(r' *[\t\n\r\f\v]+')
pattern_repeated_chars = re.compile(r'[,;.:!?\-_+]+(?=[,;.:!?\-_+])')
pattern_dates = re.compile(r'(\d{1,2}\.)?(\d{1,2}\.)([\d]{2,4})?')
pattern_whitespace = re.compile(r'[ ]{2,}')
# ** RE applications
# following functions clean and prepare specific entries, not whole datasets
def clean_string_slim(string: str) -> str:
"""mapping function to clean single string entries in a series (feature-wise)
of the dataset, intended to be applied element-wise to string features
Parameters
----------
string : str
dataset entry feature
Returns
-------
str
cleaned entry
"""
# remove special chars
string = pattern_escape_newline.sub('. ', string)
string = pattern_escape_seq.sub(' ', string)
string = pattern_repeated_chars.sub('', string)
# string = pattern_dates.sub('', string)
# dates are used for context, should not be removed at this stage
string = pattern_whitespace.sub(' ', string)
# remove whitespaces at the beginning and the end
string = string.strip()
return string
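
A rough sketch of the combined regex passes on an invented maintenance text:

sample = 'Pumpe defekt!!!\n\nersetzt..  '
print(clean_string_slim(sample))
# -> 'Pumpe defekt. ersetzt.'
# newline runs become '. ', punctuation runs collapse to their last character,
# repeated blanks shrink to a single space, and the ends are stripped
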
# ** dataset cleansing
def entry_wise_cleansing(
data: DataFrame,
target_features: Collection[str],
cleansing_func: Callable[[str], str] = clean_string_slim,
) -> tuple[DataFrame]:
# apply given cleansing function to target feature
target_features = list(target_features)
data[target_features] = data[target_features].map(cleansing_func)
logger.info(
('Successfully applied entry-wise cleansing procedure >>%s<< for features >>%s<<'),
cleansing_func.__name__,
target_features,
)
return (data,)
# ** similarities
def candidates_by_index(
data_model_input: Series,
model: SentenceTransformer,

View File

@ -1,4 +1,4 @@
from collections.abc import Iterable, Iterator
from collections.abc import Collection, Iterable, Iterator
from typing import cast
from pandas import DataFrame, Series
@ -7,14 +7,21 @@ from tqdm.auto import tqdm # TODO: check deletion
from lang_main.analysis.shared import (
candidates_by_index,
entry_wise_cleansing,
pattern_escape_seq_sentences,
similar_index_connection_graph,
similar_index_groups,
)
from lang_main.loggers import logger_timeline as logger
from lang_main.types import ObjectID, PandasIndex, TimelineCandidates
from lang_main.types import (
DataFrameTLFiltered,
ObjectID,
PandasIndex,
TimelineCandidates,
)
def non_relevant_obj_ids(
def _non_relevant_obj_ids(
data: DataFrame,
thresh_unique_feat_per_id: int,
*,
@ -50,9 +57,9 @@ def remove_non_relevant_obj_ids(
feature_uniqueness: str = 'HObjektText',
feature_obj_id: str = 'ObjektID',
) -> tuple[DataFrame]:
logger.info('Removing non-relevant ObjectIDs from dataset')
logger.info('Removing non-relevant ObjectIDs from dataset...')
data = data.copy()
ids_to_ignore = non_relevant_obj_ids(
ids_to_ignore = _non_relevant_obj_ids(
data=data,
thresh_unique_feat_per_id=thresh_unique_feat_per_id,
feature_uniqueness=feature_uniqueness,
@ -61,7 +68,43 @@ def remove_non_relevant_obj_ids(
# only retain entries with ObjectIDs not in IDs to ignore
data = data.loc[~(data[feature_obj_id].isin(ids_to_ignore))]
logger.debug('Ignored ObjectIDs: %s', ids_to_ignore)
logger.info('Non-relevant ObjectIDs removed successfully')
logger.info('Non-relevant ObjectIDs removed successfully.')
return (data,)
def cleanup_descriptions(
data: DataFrame,
properties: Collection[str] = (
'VorgangsBeschreibung',
'ErledigungsBeschreibung',
),
) -> tuple[DataFrame]:
logger.info('Cleaning necessary descriptions...')
data = data.copy()
features = list(properties)
data[features] = data[features].fillna('N.V.')
(data,) = entry_wise_cleansing(data, target_features=features)
logger.info('Cleansing successful.')
return (data.copy(),)
def calc_delta_to_repair(
data: DataFrame,
date_feature_start: str = 'ErstellungsDatum',
date_feature_end: str = 'ErledigungsDatum',
name_delta_feature: str = 'delta_to_repair',
convert_to_days: bool = True,
) -> tuple[DataFrame]:
logger.info('Calculating time differences between start and end of operations...')
data = data.copy()
data[name_delta_feature] = data[date_feature_end] - data[date_feature_start]
if convert_to_days:
data[name_delta_feature] = data[name_delta_feature].dt.days
logger.info('Calculation successful.')
return (data,)
@ -75,7 +118,7 @@ def generate_model_input(
'VorgangsBeschreibung',
),
) -> tuple[DataFrame]:
logger.info('Generating concatenation of model input features')
logger.info('Generating concatenation of model input features...')
data = data.copy()
model_input_features = list(model_input_features)
input_features = data[model_input_features].fillna('').astype(str)
@ -83,7 +126,7 @@ def generate_model_input(
lambda x: ' - '.join(x),
axis=1,
)
logger.info('Model input generated successfully')
logger.info('Model input generated successfully.')
return (data,)
@ -97,7 +140,7 @@ def filter_activities_per_obj_id(
) -> tuple[DataFrame, Series]:
data = data.copy()
# filter only relevant activities and count occurrences for each ObjectID
logger.info('Filtering activities per ObjectID')
logger.info('Filtering activities per ObjectID...')
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
data_filter_activities = data.loc[filt_rel_activities].copy()
num_activities_per_obj_id = cast(
@ -113,7 +156,7 @@ def filter_activities_per_obj_id(
num_activities_per_obj_id = num_activities_per_obj_id.loc[~filt_below_thresh]
data_filter_activities = data_filter_activities.loc[~filt_entries_below_thresh]
logger.info('Activities per ObjectID filtered successfully')
logger.info('Activities per ObjectID filtered successfully.')
return data_filter_activities, num_activities_per_obj_id
@ -129,7 +172,7 @@ def filter_activities_per_obj_id(
## use idx pairs to get idx values of series
def get_timeline_candidates_index(
def _get_timeline_candidates_index(
data: DataFrame,
num_activities_per_obj_id: Series,
*,
@ -161,7 +204,7 @@ def get_timeline_candidates_index(
# TODO: check application for duplicate removal
def transform_timeline_candidates(
def _transform_timeline_candidates(
candidates: Iterator[tuple[ObjectID, tuple[PandasIndex, ...]]],
) -> TimelineCandidates:
"""function to build a mapping of ObjectIDs to their respective collection of
@ -200,7 +243,7 @@ def transform_timeline_candidates(
return candidates_by_obj_id
def map_obj_id_to_texts(
def _map_obj_id_to_texts(
data: DataFrame,
feature_obj_id: str = 'ObjektID',
) -> dict[ObjectID, str]:
@ -229,7 +272,7 @@ def get_timeline_candidates(
model_input_feature: str = 'nlp_model_input',
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
logger.info('Obtaining timeline candidates...')
candidates = get_timeline_candidates_index(
candidates = _get_timeline_candidates_index(
data=data,
num_activities_per_obj_id=num_activities_per_obj_id,
model=model,
@ -237,14 +280,52 @@ def get_timeline_candidates(
feature_obj_id=feature_obj_id,
model_input_feature=model_input_feature,
)
tl_candidates = transform_timeline_candidates(candidates)
tl_candidates = _transform_timeline_candidates(candidates)
logger.info('Timeline candidates obtained successfully.')
# text mapping to obtain object descriptors
logger.info('Mapping ObjectIDs to their respective text descriptor...')
map_obj_text = map_obj_id_to_texts(
map_obj_text = _map_obj_id_to_texts(
data=data,
feature_obj_id=feature_obj_id,
)
logger.info('ObjectIDs successfully mapped to text descriptors.')
return tl_candidates, map_obj_text
# ** Postprocessing
# filter original dataset for a batch of timeline candidates
def filter_timeline_cands(
data: DataFrame,
cands: TimelineCandidates,
obj_id: ObjectID,
entry_idx: int,
sort_feature: str = 'ErstellungsDatum',
) -> DataFrameTLFiltered:
data = data.copy()
cands_for_obj_id = cands[obj_id]
cands_choice = cands_for_obj_id[entry_idx]
data = data.loc[list(cands_choice)].sort_values(
by=sort_feature,
ascending=True,
)
return data
def calc_delta_to_next_failure(
data: DataFrameTLFiltered,
date_feature: str = 'ErstellungsDatum',
name_delta_feature: str = 'delta_to_next_failure',
convert_to_days: bool = True,
) -> DataFrameTLFiltered:
data = data.copy()
last_val = data[date_feature].iat[-1]
shifted = data[date_feature].shift(-1, fill_value=last_val)
data[name_delta_feature] = shifted - data[date_feature]
data = data.sort_values(by=name_delta_feature, ascending=False)
if convert_to_days:
data[name_delta_feature] = data[name_delta_feature].dt.days
return data
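
A small worked example of the shift-based delta (dates invented); sorting in descending order puts the actions followed by the longest failure-free period first, which is what the dashboard's best-actions table shows:

import pandas as pd

toy = pd.DataFrame(
    {'ErstellungsDatum': pd.to_datetime(['2024-01-01', '2024-01-10', '2024-02-01'])}
)
out = calc_delta_to_next_failure(toy)
# deltas per original row: 9, 22 and 0 days (the last row has no successor and is
# filled with its own date); after sorting, 'delta_to_next_failure' reads 22, 9, 0
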

View File

@ -76,13 +76,14 @@ CYTO_LAYOUT_PROPERTIES: Final[CytoLayoutProperties] = {
'isDeterministic': True,
'singlePartition': False,
}
CYTO_SANDBOX_NAME: Final[str] = 'lang_main'
CYTO_STYLESHEET_NAME: Final[str] = 'lang_main'
# name of the property on which selection is done
CYTO_SELECTION_PROPERTY: Final[str] = 'node_selection'
CYTO_NUMBER_SUBGRAPHS: Final[int] = 5
CYTO_ITER_NEIGHBOUR_DEPTH: Final[int] = 2
# ** time analysis.uniqueness
# ** time_analysis.uniqueness
THRESHOLD_UNIQUE_TEXTS: Final[int] = CONFIG['time_analysis']['uniqueness'][
'threshold_unique_texts'
]
@ -90,6 +91,10 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
'criterion_feature'
]
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
# ** time_analysis.preparation
NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'Zeitspanne bis zur Behebung [Tage]'
NAME_DELTA_FEAT_TO_NEXT_FAILURE: Final[str] = 'Zeitspanne bis zum nächsten Ereignis [Tage]'
# ** time_analysis.model_input
MODEL_INPUT_FEATURES: Final[tuple[str, ...]] = tuple(
CONFIG['time_analysis']['model_input']['input_features']

View File

@ -3,15 +3,19 @@ from pathlib import Path
from lang_main.analysis import graphs
from lang_main.analysis.preprocessing import (
analyse_feature,
clean_string_slim,
entry_wise_cleansing,
load_raw_data,
merge_similarity_dupl,
numeric_pre_filter_feature,
remove_duplicates,
remove_NA,
)
from lang_main.analysis.shared import (
clean_string_slim,
entry_wise_cleansing,
)
from lang_main.analysis.timeline import (
calc_delta_to_repair,
cleanup_descriptions,
filter_activities_per_obj_id,
generate_model_input,
get_timeline_candidates,
@ -25,6 +29,7 @@ from lang_main.constants import (
DATE_COLS,
FEATURE_NAME_OBJ_ID,
MODEL_INPUT_FEATURES,
NAME_DELTA_FEAT_TO_REPAIR,
SAVE_PATH_FOLDER,
SPCY_MODEL,
STFR_MODEL,
@ -56,7 +61,7 @@ def build_base_target_feature_pipe() -> Pipeline:
pipe_target_feat.add(
entry_wise_cleansing,
{
'target_feature': 'VorgangsBeschreibung',
'target_features': ('VorgangsBeschreibung',),
'cleansing_func': clean_string_slim,
},
save_result=True,
@ -182,7 +187,6 @@ def build_tk_graph_rescaling_pipe(
graphs.pipe_add_graph_metrics,
save_result=save_result,
filename=exit_point,
# filename=EntryPoints.TK_GRAPH_ANALYSIS_RESCALED,
)
return pipe_graph_rescaling
@ -247,6 +251,23 @@ def build_tk_graph_render_pipe(
# ** timeline analysis
def build_timeline_pipe() -> Pipeline:
pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
pipe_timeline.add(
cleanup_descriptions,
{
'properties': ['ErledigungsBeschreibung'],
},
)
pipe_timeline.add(
calc_delta_to_repair,
{
'date_feature_start': 'ErstellungsDatum',
'date_feature_end': 'ErledigungsDatum',
'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
'convert_to_days': True,
},
save_result=True,
filename=EntryPoints.TIMELINE_POST,
)
pipe_timeline.add(
remove_non_relevant_obj_ids,
{
@ -281,7 +302,7 @@ def build_timeline_pipe() -> Pipeline:
'model_input_feature': 'nlp_model_input',
},
save_result=True,
filename=EntryPoints.TIMELINE_POST,
filename=EntryPoints.TIMELINE_CANDS,
)
return pipe_timeline

View File

@ -16,6 +16,7 @@ from lang_main.constants import (
CYTO_LAYOUT_PROPERTIES,
CYTO_NUMBER_SUBGRAPHS,
CYTO_PATH_STYLESHEET,
CYTO_SANDBOX_NAME,
CYTO_SELECTION_PROPERTY,
CYTO_STYLESHEET_NAME,
PROPERTY_NAME_DEGREE_WEIGHTED,
@ -56,6 +57,8 @@ def verify_connection():
def import_to_cytoscape(
graph: DiGraph | Graph,
network_name: str = CYTO_BASE_NETWORK_NAME,
sandbox_name: str = CYTO_SANDBOX_NAME,
reinitialise_sandbox: bool = True,
) -> None:
"""Cytoscape: import NetworkX graph as new network collection
@ -66,6 +69,12 @@ def import_to_cytoscape(
"""
logger.debug('Checking Cytoscape connection...')
verify_connection()
logger.debug('Setting Cytoscape sandbox...')
p4c.sandbox_set(
sandbox_name=sandbox_name,
reinitialize=reinitialise_sandbox,
copy_samples=False,
)
logger.debug('Importing to and analysing network in Cytoscape...')
p4c.delete_all_networks()
p4c.create_network_from_networkx(
@ -122,6 +131,7 @@ def export_network_to_image(
filetype: CytoExportFileTypes = 'SVG',
network_name: str = CYTO_BASE_NETWORK_NAME,
pdf_export_page_size: CytoExportPageSizes = 'A4',
sandbox_name: str = CYTO_SANDBOX_NAME,
) -> None:
"""Cytoscape: export current selected view as image
@ -140,14 +150,17 @@ def export_network_to_image(
logger.debug('Exporting image to file...')
if not target_folder.exists():
target_folder.mkdir(parents=True)
file_pth = target_folder / filename
dst_file_pth = (target_folder / filename).with_suffix(f'.{filetype.lower()}')
text_as_font = True
if filetype == 'SVG':
text_as_font = False
# image is generated in sandbox directory and transferred to target destination
# (preparation for remote instances of Cytoscape)
# TODO close unnecessary windows before image display
p4c.export_image(
filename=str(file_pth),
filename=filename,
type=filetype,
network=network_name,
overwrite_file=True,
@ -155,7 +168,24 @@ def export_network_to_image(
export_text_as_font=text_as_font,
page_size=pdf_export_page_size,
)
logger.debug('Exporting image to file successful.')
# TODO change back to Cytoscape 3.10 and above
# TODO remove if Cytoscape >= 3.10.* is running in container
# p4c.export_image(
# filename=filename,
# type=filetype,
# network=network_name,
# overwrite_file=True,
# )
logger.debug('Exported image to sandbox.')
logger.debug('Transferring image from sandbox to target destination...')
sandbox_filename = f'{filename}.{filetype.lower()}'
p4c.sandbox_get_from(
source_file=sandbox_filename,
dest_file=str(dst_file_pth),
overwrite=True,
sandbox_name=sandbox_name,
)
logger.debug('Transfer of image from sandbox to target destination successful.')
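
Condensed, the sandbox round-trip used above looks roughly like this (a sketch assuming a reachable Cytoscape instance; CYTO_SANDBOX_NAME comes from lang_main.constants):

import py4cytoscape as p4c

from lang_main.constants import CYTO_SANDBOX_NAME

p4c.sandbox_set(sandbox_name=CYTO_SANDBOX_NAME, reinitialize=True, copy_samples=False)
# ... create, layout and style the network ...
p4c.export_image(filename='graph', type='SVG', overwrite_file=True)  # written into the sandbox
p4c.sandbox_get_from(
    source_file='graph.svg',      # file name inside the sandbox
    dest_file='/tmp/graph.svg',   # hypothetical local target path
    overwrite=True,
    sandbox_name=CYTO_SANDBOX_NAME,
)
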
def layout_network(
@ -192,6 +222,7 @@ def apply_style_to_network(
node_size_property: str = 'node_selection',
min_node_size: int = 15,
max_node_size: int = 40,
sandbox_name: str = CYTO_SANDBOX_NAME,
) -> None:
"""Cytoscape: apply a chosen Cytoscape style to the defined network
@ -221,7 +252,16 @@ def apply_style_to_network(
raise FileNotFoundError(
f'Visual stylesheet for Cytoscape not found under: >>{pth_to_stylesheet}<<'
)
p4c.import_visual_styles(str(pth_to_stylesheet))
# send to sandbox
sandbox_filename = pth_to_stylesheet.name
p4c.sandbox_send_to(
source_file=pth_to_stylesheet,
dest_file=sandbox_filename,
overwrite=True,
sandbox_name=sandbox_name,
)
# load stylesheet
p4c.import_visual_styles(sandbox_filename)
p4c.set_visual_style(style_name, network=network_name)
# node size mapping, only if needed property is available
@ -242,6 +282,7 @@ def apply_style_to_network(
default_number=min_node_size,
)
p4c.set_node_size_mapping(**node_size_map)
# TODO removal
# else:
# node_table = p4c.get_table_columns(table='node', network=network_name)
# nodes_SUID = node_table['SUID'].to_list()

View File

@ -1,5 +1,7 @@
import enum
from collections.abc import Hashable
from typing import (
Any,
Literal,
Required,
TypeAlias,
@ -7,6 +9,7 @@ from typing import (
)
import numpy as np
from pandas import DataFrame
from spacy.tokens.doc import Doc as SpacyDoc
from torch import Tensor
@ -33,6 +36,7 @@ ResultHandling: TypeAlias = list[tuple[bool, str | None]]
class EntryPoints(enum.StrEnum):
TIMELINE = 'TIMELINE'
TIMELINE_POST = 'TIMELINE_POSTPROCESSING'
TIMELINE_CANDS = 'TIMELINE_CANDIDATES'
TIMELINE_TK_GRAPH_RESCALED = 'TIMELINE_TK_GRAPH_RESCALED'
TK_GRAPH_POST = 'TK-GRAPH_POSTPROCESSING'
TK_GRAPH_ANALYSIS = 'TK-GRAPH_ANALYSIS'
@ -44,6 +48,7 @@ class EntryPoints(enum.StrEnum):
PandasIndex: TypeAlias = int | np.int64
ObjectID: TypeAlias = int
Embedding: TypeAlias = SpacyDoc | Tensor
DataFrameTLFiltered: TypeAlias = DataFrame
# ** graphs
NodeTitle: TypeAlias = str
@ -118,3 +123,8 @@ class CytoscapeData(TypedDict, total=False):
# ** timeline
TimelineCandidates: TypeAlias = dict[ObjectID, tuple[tuple[PandasIndex, ...], ...]]
# ** Dash (Dashboard)
HTMLTable: TypeAlias = list[dict[Hashable, Any]]
HTMLColumns: TypeAlias = list[dict[str, str]]

File diff suppressed because one or more lines are too long