added test cases
This commit is contained in:
@@ -112,8 +112,6 @@ def candidates_by_index(
|
||||
)
|
||||
# cosine similarity
|
||||
cos_sim = cast(npt.NDArray, model.similarity(embds, embds).numpy())
|
||||
# TODO check removal
|
||||
# cos_sim = cast(npt.NDArray, sentence_transformers.util.cos_sim(embds, embds).numpy())
|
||||
np.fill_diagonal(cos_sim, 0.0)
|
||||
cos_sim = np.triu(cos_sim)
|
||||
cos_sim_idx = np.argwhere(cos_sim >= cos_sim_threshold)
|
||||
|
||||
@@ -142,7 +142,7 @@ def filter_activities_per_obj_id(
|
||||
threshold_num_activities: int = 1,
|
||||
) -> tuple[DataFrame, Series]:
|
||||
data = data.copy()
|
||||
# filter only relevant activities count occurrences for each ObjectID
|
||||
# filter only relevant activities, count occurrences for each ObjectID
|
||||
logger.info('Filtering activities per ObjectID...')
|
||||
filt_rel_activities = data[activity_feature].isin(relevant_activity_types)
|
||||
data_filter_activities = data.loc[filt_rel_activities].copy()
|
||||
@@ -249,6 +249,7 @@ def _transform_timeline_candidates(
|
||||
def _map_obj_id_to_texts(
|
||||
data: DataFrame,
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
feature_obj_text: str = 'HObjektText',
|
||||
) -> dict[ObjectID, str]:
|
||||
data = data.copy()
|
||||
obj_ids = cast(Iterable[ObjectID], data[feature_obj_id].unique())
|
||||
@@ -256,9 +257,9 @@ def _map_obj_id_to_texts(
|
||||
obj_id_to_text: dict[ObjectID, str] = {}
|
||||
|
||||
for obj_id in tqdm(obj_ids):
|
||||
data_per_obj = cast(DataFrame, data.loc[data['ObjektID'] == obj_id])
|
||||
data_per_obj = cast(DataFrame, data.loc[data[feature_obj_id] == obj_id])
|
||||
# just take first entry
|
||||
obj_text = cast(str, data_per_obj['HObjektText'].dropna().iat[0])
|
||||
obj_text = cast(str, data_per_obj[feature_obj_text].dropna().iat[0])
|
||||
obj_text = obj_text.strip(r' ,.:')
|
||||
obj_id_to_text[obj_id] = obj_text
|
||||
|
||||
@@ -272,6 +273,7 @@ def get_timeline_candidates(
|
||||
model: SentenceTransformer,
|
||||
cos_sim_threshold: float,
|
||||
feature_obj_id: str = 'ObjektID',
|
||||
feature_obj_text: str = 'HObjektText',
|
||||
model_input_feature: str = 'nlp_model_input',
|
||||
) -> tuple[TimelineCandidates, dict[ObjectID, str]]:
|
||||
logger.info('Obtaining timeline candidates...')
|
||||
@@ -290,6 +292,7 @@ def get_timeline_candidates(
|
||||
map_obj_text = _map_obj_id_to_texts(
|
||||
data=data,
|
||||
feature_obj_id=feature_obj_id,
|
||||
feature_obj_text=feature_obj_text,
|
||||
)
|
||||
logger.info('ObjectIDs successfully mapped to text descriptors.')
|
||||
|
||||
|
||||
@@ -145,6 +145,9 @@ UNIQUE_CRITERION_FEATURE: Final[str] = CONFIG['time_analysis']['uniqueness'][
|
||||
'criterion_feature'
|
||||
]
|
||||
FEATURE_NAME_OBJ_ID: Final[str] = CONFIG['time_analysis']['uniqueness']['feature_name_obj_id']
|
||||
FEATURE_NAME_OBJ_TEXT: Final[str] = CONFIG['time_analysis']['uniqueness'][
|
||||
'feature_name_obj_text'
|
||||
]
|
||||
# ** time_analysis.preparation
|
||||
# NAME_DELTA_FEAT_TO_REPAIR: Final[str] = 'delta_to_repair'
|
||||
CONFIG['time_analysis']['preparation']['name_delta_feat_to_repair']
|
||||
|
||||
@@ -42,6 +42,7 @@ threshold_edge_number = 330
|
||||
threshold_unique_texts = 4
|
||||
criterion_feature = 'HObjektText'
|
||||
feature_name_obj_id = 'ObjektID'
|
||||
feature_name_obj_text = 'HObjektText'
|
||||
|
||||
[time_analysis.preparation]
|
||||
name_delta_feat_to_repair = 'Zeitspanne bis zur Behebung [Tage]'
|
||||
|
||||
@@ -29,6 +29,7 @@ from lang_main.constants import (
|
||||
CYTO_BASE_NETWORK_NAME,
|
||||
DATE_COLS,
|
||||
FEATURE_NAME_OBJ_ID,
|
||||
FEATURE_NAME_OBJ_TEXT,
|
||||
MODEL_INPUT_FEATURES,
|
||||
NAME_DELTA_FEAT_TO_REPAIR,
|
||||
SAVE_PATH_FOLDER,
|
||||
@@ -287,6 +288,7 @@ def build_timeline_pipe() -> Pipeline:
|
||||
'model': STFR_MODEL,
|
||||
'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
|
||||
'feature_obj_id': FEATURE_NAME_OBJ_ID,
|
||||
'feature_obj_text': FEATURE_NAME_OBJ_TEXT,
|
||||
'model_input_feature': 'nlp_model_input',
|
||||
},
|
||||
save_result=True,
|
||||
|
||||
Reference in New Issue
Block a user