Module lang_main.pipelines.predefined
Functions
def build_base_target_feature_pipe() -> Pipeline
Expand source code
def build_base_target_feature_pipe() -> Pipeline:
    """Assemble the base pipeline that loads, cleans and analyses the target feature.

    Returns
    -------
    Pipeline
        Pipeline named 'Target_Feature' (working dir ``SAVE_PATH_FOLDER``) with
        steps: raw-data loading, duplicate removal, NA removal, entry-wise
        string cleansing of the target feature, and feature analysis.
    """
    pipeline = Pipeline(name='Target_Feature', working_dir=SAVE_PATH_FOLDER)
    # load the raw dataset, parsing the configured date columns
    pipeline.add(load_raw_data, {'date_cols': DATE_COLS})
    pipeline.add(remove_duplicates)
    pipeline.add(remove_NA, save_result=True)
    # normalise the raw text entries of the target feature
    pipeline.add(
        entry_wise_cleansing,
        {
            'target_features': (TARGET_FEATURE,),
            'cleansing_func': clean_string_slim,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE,
    )
    pipeline.add(analyse_feature, {'target_feature': TARGET_FEATURE}, save_result=True)
    return pipeline
Expand source code
def build_merge_duplicates_pipe() -> Pipeline:
    """Assemble the pipeline that merges near-duplicate text entries.

    Returns
    -------
    Pipeline
        Pipeline named 'Merge_Duplicates' (working dir ``SAVE_PATH_FOLDER``)
        that first filters out entries below a minimum character count and
        then merges entries whose embedding cosine similarity exceeds the
        configured threshold.
    """
    merge_pipe = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
    # keep only entries whose 'len' feature meets the minimum character count
    length_filter = {
        'feature': 'len',
        'bound_lower': THRESHOLD_AMOUNT_CHARACTERS,
        'bound_upper': None,
    }
    merge_pipe.add(numeric_pre_filter_feature, length_filter)
    # collapse entries above the cosine-similarity threshold into one
    merge_pipe.add(
        merge_similarity_duplicates,
        {'model': STFR_MODEL, 'cos_sim_threshold': THRESHOLD_SIMILARITY},
        save_result=True,
        filename=EntryPoints.TOKEN_ANALYSIS,
    )
    return merge_pipe
Expand source code
def build_timeline_pipe() -> Pipeline:
    """Assemble the timeline-analysis pipeline.

    Returns
    -------
    Pipeline
        Pipeline named 'Timeline_Analysis' (working dir ``SAVE_PATH_FOLDER``)
        that cleans completion descriptions, computes creation-to-completion
        deltas, filters object IDs and activities, builds the NLP model input
        column and finally derives timeline candidates via cosine similarity.
    """
    tl_pipe = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
    # clean up the free-text completion-description feature
    tl_pipe.add(cleanup_descriptions, {'properties': ['ErledigungsBeschreibung']})
    # time span between creation and completion dates, converted to days
    delta_params = {
        'date_feature_start': 'ErstellungsDatum',
        'date_feature_end': 'ErledigungsDatum',
        'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
        'convert_to_days': True,
    }
    tl_pipe.add(
        calc_delta_to_repair,
        delta_params,
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
    )
    # drop object IDs that do not reach the uniqueness threshold
    tl_pipe.add(
        remove_non_relevant_obj_ids,
        {
            'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
            'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
            'feature_obj_id': FEATURE_NAME_OBJ_ID,
        },
        save_result=True,
    )
    # build the 'nlp_model_input' column from the configured input features
    tl_pipe.add(
        generate_model_input,
        {
            'target_feature_name': 'nlp_model_input',
            'model_input_features': MODEL_INPUT_FEATURES,
        },
    )
    # keep only object IDs with enough activities of the relevant types
    tl_pipe.add(
        filter_activities_per_obj_id,
        {
            'activity_feature': ACTIVITY_FEATURE,
            'relevant_activity_types': ACTIVITY_TYPES,
            'feature_obj_id': FEATURE_NAME_OBJ_ID,
            'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
        },
    )
    # derive timeline candidates using the sentence-transformer model and
    # the timeline-specific cosine-similarity threshold
    tl_pipe.add(
        get_timeline_candidates,
        {
            'model': STFR_MODEL,
            'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
            'feature_obj_id': FEATURE_NAME_OBJ_ID,
            'feature_obj_text': FEATURE_NAME_OBJ_TEXT,
            'model_input_feature': 'nlp_model_input',
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_CANDS,
    )
    return tl_pipe
Expand source code
def build_tk_graph_pipe() -> Pipeline:
    """Assemble the pipeline that builds the token graph.

    Returns
    -------
    Pipeline
        Pipeline named 'Token_Analysis' (working dir ``SAVE_PATH_FOLDER``)
        with a single token-graph construction step over the 'entry' feature,
        weighted by 'num_occur'.
    """
    tk_pipe = Pipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
    graph_params = {
        'model': SPACY_MODEL,
        'target_feature': 'entry',
        'weights_feature': 'num_occur',
        'batch_idx_feature': 'batched_idxs',
        # no token map is built here; the model processes entries in
        # batches of 50
        'build_map': False,
        'batch_size_model': 50,
    }
    tk_pipe.add(
        build_token_graph,
        graph_params,
        save_result=True,
        filename=EntryPoints.TK_GRAPH_POST,
    )
    return tk_pipe
Expand source code
def build_tk_graph_post_pipe() -> Pipeline:
    """Assemble the token-graph post-processing pipeline.

    Returns
    -------
    Pipeline
        Pipeline named 'Graph_Postprocessing' (working dir
        ``SAVE_PATH_FOLDER``) that limits the edge count by the 'weight'
        property, filters nodes by degree and runs a static graph analysis.
    """
    post_pipe = Pipeline(name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER)
    # limit the number of edges based on the 'weight' property
    post_pipe.add(
        graphs.filter_graph_by_number_edges,
        {'limit': MAX_EDGE_NUMBER, 'property': 'weight'},
    )
    # keep nodes with degree >= 1 (no upper bound)
    post_pipe.add(
        graphs.filter_graph_by_node_degree,
        {'bound_lower': 1, 'bound_upper': None},
    )
    post_pipe.add(
        graphs.static_graph_analysis,
        save_result=True,
        filename=EntryPoints.TK_GRAPH_ANALYSIS,
    )
    return post_pipe
export_folder: pathlib.Path = WindowsPath('A:/Arbeitsaufgaben/lang-data/out'),
base_network_name: str = 'token_graph') -> Pipeline
Expand source code
def build_tk_graph_render_pipe(
    with_subgraphs: bool,
    export_folder: Path = SAVE_PATH_FOLDER,
    base_network_name: str = CYTO_BASE_NETWORK_NAME,
) -> Pipeline:
    """Assemble the Cytoscape rendering pipeline for the token graph.

    Parameters
    ----------
    with_subgraphs : bool
        If True, additionally select sub-graph nodes and build/export
        sub-networks as images.
    export_folder : Path
        Folder that exported network images are written to.
    base_network_name : str
        Name used for the network inside Cytoscape and for the exported
        image file.

    Returns
    -------
    Pipeline
        Pipeline named 'Graph_Static-Rendering' (working dir
        ``SAVE_PATH_FOLDER``) with import, layout, styling and export steps.

    Raises
    ------
    ImportError
        If the optional Cytoscape rendering dependencies are not installed.
    """
    # optional dependency: late import so the base package can be used
    # without the rendering extras installed
    try:
        from lang_main.render import cytoscape as cyto
    except ImportError as err:
        # fix: space added between the two sentences (implicit string
        # concatenation previously produced "not found.Install"); chain the
        # original exception so the missing module is visible in tracebacks
        raise ImportError(
            'Dependencies for Cytoscape interaction not found. '
            'Install package with optional dependencies.'
        ) from err
    pipe_graph_rendering = Pipeline(
        name='Graph_Static-Rendering',
        working_dir=SAVE_PATH_FOLDER,
    )
    # import the graph, lay it out and apply the style before exporting
    pipe_graph_rendering.add(
        cyto.import_to_cytoscape,
        {'network_name': base_network_name},
    )
    pipe_graph_rendering.add(
        cyto.layout_network,
        {'network_name': base_network_name},
    )
    pipe_graph_rendering.add(
        cyto.apply_style_to_network,
        {'network_name': base_network_name},
    )
    pipe_graph_rendering.add(
        cyto.export_network_to_image,
        {
            'filename': base_network_name,
            'target_folder': export_folder,
            'network_name': base_network_name,
        },
    )
    if with_subgraphs:
        pipe_graph_rendering.add(
            cyto.get_subgraph_node_selection,
            {'network_name': base_network_name},
        )
        pipe_graph_rendering.add(
            cyto.build_subnetworks,
            {
                'export_image': True,
                'target_folder': export_folder,
                'network_name': base_network_name,
            },
        )
    return pipe_graph_rendering
Expand source code
def build_tk_graph_rescaling_pipe(
    save_result: bool,
    exit_point: EntryPoints,
) -> Pipeline:
    """Assemble the pipeline that rescales edge weights and adds graph metrics.

    Parameters
    ----------
    save_result : bool
        Whether the result of the final (metrics) step is persisted.
    exit_point : EntryPoints
        Entry point / filename under which the final result is stored.

    Returns
    -------
    Pipeline
        Pipeline named 'Graph_Rescaling' (working dir ``SAVE_PATH_FOLDER``).
    """
    rescale_pipe = Pipeline(name='Graph_Rescaling', working_dir=SAVE_PATH_FOLDER)
    rescale_pipe.add(graphs.pipe_rescale_graph_edge_weights)
    rescale_pipe.add(
        graphs.pipe_add_graph_metrics,
        save_result=save_result,
        filename=exit_point,
    )
    return rescale_pipe