Module lang_main.pipelines.predefined

Builder functions that assemble the predefined Pipeline objects of the lang_main workflow: target-feature preparation, similarity-based duplicate merging, timeline analysis, token-graph construction, graph post-processing, rescaling and rendering.

Functions

def build_base_target_feature_pipe() ‑> Pipeline
def build_base_target_feature_pipe() -> Pipeline:
    pipe_target_feat = Pipeline(name='Target_Feature', working_dir=SAVE_PATH_FOLDER)
    pipe_target_feat.add(
        load_raw_data,
        {
            'date_cols': DATE_COLS,
        },
    )
    pipe_target_feat.add(remove_duplicates)
    pipe_target_feat.add(remove_NA, save_result=True)
    pipe_target_feat.add(
        entry_wise_cleansing,
        {
            'target_features': (TARGET_FEATURE,),
            'cleansing_func': clean_string_slim,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE,
    )
    pipe_target_feat.add(
        analyse_feature,
        {
            'target_feature': TARGET_FEATURE,
        },
        save_result=True,
    )

    return pipe_target_feat
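
build_base_target_feature_pipe wires raw-data loading, duplicate and NA removal, entry-wise cleansing of the target feature and the feature analysis into one pipeline. The sketch below shows, with a hypothetical and heavily simplified stand-in class, how such a step-based pipeline can be read: each add call binds a callable plus keyword arguments, and each step consumes the previous step's result. MiniPipeline and its run method are illustrative assumptions, not the actual lang_main Pipeline (which additionally persists intermediate results via working_dir, save_result and filename).

# Hypothetical, simplified stand-in for illustration only -- not the real
# lang_main Pipeline implementation.
from __future__ import annotations

from typing import Any, Callable


class MiniPipeline:
    def __init__(self, name: str) -> None:
        self.name = name
        self._steps: list[tuple[Callable[..., Any], dict[str, Any]]] = []

    def add(
        self,
        func: Callable[..., Any],
        params: dict[str, Any] | None = None,
        **_: Any,  # save_result/filename are accepted but ignored in this sketch
    ) -> None:
        self._steps.append((func, params or {}))

    def run(self, data: Any = None) -> Any:
        # Each step receives the previous step's output plus its bound parameters.
        for func, params in self._steps:
            data = func(data, **params) if data is not None else func(**params)
        return data
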
def build_merge_duplicates_pipe() ‑> Pipeline
def build_merge_duplicates_pipe() -> Pipeline:
    pipe_merge = Pipeline(name='Merge_Duplicates', working_dir=SAVE_PATH_FOLDER)
    pipe_merge.add(
        numeric_pre_filter_feature,
        {
            'feature': 'len',
            'bound_lower': THRESHOLD_AMOUNT_CHARACTERS,
            'bound_upper': None,
        },
    )
    pipe_merge.add(
        merge_similarity_duplicates,
        {
            'model': STFR_MODEL,
            'cos_sim_threshold': THRESHOLD_SIMILARITY,
        },
        save_result=True,
        filename=EntryPoints.TOKEN_ANALYSIS,
    )

    return pipe_merge
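
build_merge_duplicates_pipe first drops entries whose 'len' falls below THRESHOLD_AMOUNT_CHARACTERS and then merges entries whose embeddings exceed THRESHOLD_SIMILARITY in cosine similarity. The following sketch illustrates the similarity part with sentence-transformers; the model name, the helper find_similarity_duplicates and the pairing logic are illustrative assumptions, not the actual merge_similarity_duplicates implementation.

# Illustrative sketch only: collect index pairs whose pairwise cosine
# similarity exceeds a threshold, roughly what a similarity-based merge needs.
from sentence_transformers import SentenceTransformer, util


def find_similarity_duplicates(texts: list[str], threshold: float = 0.9) -> list[tuple[int, int]]:
    model = SentenceTransformer('all-MiniLM-L6-v2')  # stand-in for STFR_MODEL
    embeddings = model.encode(texts, convert_to_tensor=True)
    cos_sim = util.cos_sim(embeddings, embeddings)
    pairs = []
    for i in range(len(texts)):
        for j in range(i + 1, len(texts)):
            if float(cos_sim[i, j]) >= threshold:
                pairs.append((i, j))  # candidate duplicates to merge
    return pairs
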
def build_timeline_pipe() ‑> Pipeline
def build_timeline_pipe() -> Pipeline:
    pipe_timeline = Pipeline(name='Timeline_Analysis', working_dir=SAVE_PATH_FOLDER)
    pipe_timeline.add(
        cleanup_descriptions,
        {
            'properties': ['ErledigungsBeschreibung'],
        },
    )
    pipe_timeline.add(
        calc_delta_to_repair,
        {
            'date_feature_start': 'ErstellungsDatum',
            'date_feature_end': 'ErledigungsDatum',
            'name_delta_feature': NAME_DELTA_FEAT_TO_REPAIR,
            'convert_to_days': True,
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_POST,
    )
    pipe_timeline.add(
        remove_non_relevant_obj_ids,
        {
            'thresh_unique_feat_per_id': THRESHOLD_UNIQUE_TEXTS,
            'feature_uniqueness': UNIQUE_CRITERION_FEATURE,
            'feature_obj_id': FEATURE_NAME_OBJ_ID,
        },
        save_result=True,
    )
    pipe_timeline.add(
        generate_model_input,
        {
            'target_feature_name': 'nlp_model_input',
            'model_input_features': MODEL_INPUT_FEATURES,
        },
    )
    pipe_timeline.add(
        filter_activities_per_obj_id,
        {
            'activity_feature': ACTIVITY_FEATURE,
            'relevant_activity_types': ACTIVITY_TYPES,
            'feature_obj_id': FEATURE_NAME_OBJ_ID,
            'threshold_num_activities': THRESHOLD_NUM_ACTIVITIES,
        },
    )
    pipe_timeline.add(
        get_timeline_candidates,
        {
            'model': STFR_MODEL,
            'cos_sim_threshold': THRESHOLD_TIMELINE_SIMILARITY,
            'feature_obj_id': FEATURE_NAME_OBJ_ID,
            'feature_obj_text': FEATURE_NAME_OBJ_TEXT,
            'model_input_feature': 'nlp_model_input',
        },
        save_result=True,
        filename=EntryPoints.TIMELINE_CANDS,
    )

    return pipe_timeline
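
Besides the similarity-based candidate search, the timeline pipeline derives a repair-duration feature from the creation and completion dates ('ErstellungsDatum', 'ErledigungsDatum') with convert_to_days=True. A minimal pandas sketch of that date-delta step follows; the helper name and the output column name are illustrative, and the real calc_delta_to_repair may handle more cases.

import pandas as pd


def delta_to_repair_days(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative sketch: completion minus creation date, expressed in days."""
    out = df.copy()
    out['delta_to_repair'] = (
        pd.to_datetime(out['ErledigungsDatum']) - pd.to_datetime(out['ErstellungsDatum'])
    ).dt.days
    return out
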
def build_tk_graph_pipe() ‑> Pipeline
def build_tk_graph_pipe() -> Pipeline:
    pipe_token_analysis = Pipeline(name='Token_Analysis', working_dir=SAVE_PATH_FOLDER)
    pipe_token_analysis.add(
        build_token_graph,
        {
            'model': SPACY_MODEL,
            'target_feature': 'entry',
            'weights_feature': 'num_occur',
            'batch_idx_feature': 'batched_idxs',
            'build_map': False,
            'batch_size_model': 50,
        },
        save_result=True,
        filename=EntryPoints.TK_GRAPH_POST,
    )

    return pipe_token_analysis
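
build_token_graph processes the cleansed entries with the configured spaCy model in batches of 50 and accumulates a weighted token graph, with 'num_occur' contributing to the edge weights. The sketch below shows one plausible reading of that idea with spaCy and networkx; the German model name, the lemma-based co-occurrence logic and the helper name are assumptions for illustration only.

import itertools

import networkx as nx
import spacy


def token_cooccurrence_graph(texts: list[str], weights: list[int]) -> nx.Graph:
    """Illustrative sketch of a weighted token co-occurrence graph."""
    nlp = spacy.load('de_core_news_sm')  # stand-in for SPACY_MODEL, must be installed
    graph = nx.Graph()
    for doc, weight in zip(nlp.pipe(texts, batch_size=50), weights):
        tokens = {t.lemma_ for t in doc if not (t.is_stop or t.is_punct)}
        for u, v in itertools.combinations(sorted(tokens), 2):
            # Edge weight sums the occurrence counts of the connected entries.
            current = graph.get_edge_data(u, v, default={'weight': 0})['weight']
            graph.add_edge(u, v, weight=current + weight)
    return graph
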
def build_tk_graph_post_pipe() ‑> Pipeline
def build_tk_graph_post_pipe() -> Pipeline:
    pipe_graph_postprocessing = Pipeline(
        name='Graph_Postprocessing', working_dir=SAVE_PATH_FOLDER
    )
    pipe_graph_postprocessing.add(
        graphs.filter_graph_by_number_edges,
        {
            'limit': MAX_EDGE_NUMBER,
            'property': 'weight',
        },
    )
    pipe_graph_postprocessing.add(
        graphs.filter_graph_by_node_degree,
        {
            'bound_lower': 1,
            'bound_upper': None,
        },
    )
    pipe_graph_postprocessing.add(
        graphs.static_graph_analysis,
        save_result=True,
        filename=EntryPoints.TK_GRAPH_ANALYSIS,
    )

    return pipe_graph_postprocessing
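
Graph post-processing keeps at most MAX_EDGE_NUMBER edges ranked by their 'weight' property and removes nodes whose degree drops below 1 before running the static analysis. The networkx sketch below mirrors these two filters in spirit; the function names and the exact ranking are illustrative, not the graphs module implementation.

import networkx as nx


def filter_by_edge_number(graph: nx.Graph, limit: int, prop: str = 'weight') -> nx.Graph:
    # Keep only the `limit` edges with the highest value of `prop`.
    ranked = sorted(graph.edges(data=True), key=lambda e: e[2].get(prop, 0), reverse=True)
    kept = nx.Graph()
    kept.add_nodes_from(graph.nodes(data=True))
    kept.add_edges_from(ranked[:limit])
    return kept


def filter_by_node_degree(graph: nx.Graph, bound_lower: int = 1) -> nx.Graph:
    # Drop nodes whose degree falls below the lower bound (isolated nodes for 1).
    kept = graph.copy()
    kept.remove_nodes_from([n for n, deg in dict(kept.degree()).items() if deg < bound_lower])
    return kept
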
def build_tk_graph_render_pipe(with_subgraphs: bool,
                               export_folder: pathlib.Path = WindowsPath('A:/Arbeitsaufgaben/lang-data/out'),
                               base_network_name: str = 'token_graph') ‑> Pipeline
def build_tk_graph_render_pipe(
    with_subgraphs: bool,
    export_folder: Path = SAVE_PATH_FOLDER,
    base_network_name: str = CYTO_BASE_NETWORK_NAME,
) -> Pipeline:
    # optional dependency: late import
    # raises exception if necessary modules are not found
    try:
        from lang_main.render import cytoscape as cyto
    except ImportError:
        raise ImportError(
            (
                'Dependencies for Cytoscape interaction not found. '
                'Install package with optional dependencies.'
            )
        )

    pipe_graph_rendering = Pipeline(
        name='Graph_Static-Rendering',
        working_dir=SAVE_PATH_FOLDER,
    )
    pipe_graph_rendering.add(
        cyto.import_to_cytoscape,
        {
            'network_name': base_network_name,
        },
    )
    pipe_graph_rendering.add(
        cyto.layout_network,
        {
            'network_name': base_network_name,
        },
    )
    pipe_graph_rendering.add(
        cyto.apply_style_to_network,
        {
            'network_name': base_network_name,
        },
    )
    pipe_graph_rendering.add(
        cyto.export_network_to_image,
        {
            'filename': base_network_name,
            'target_folder': export_folder,
            'network_name': base_network_name,
        },
    )

    if with_subgraphs:
        pipe_graph_rendering.add(
            cyto.get_subgraph_node_selection,
            {
                'network_name': base_network_name,
            },
        )
        pipe_graph_rendering.add(
            cyto.build_subnetworks,
            {
                'export_image': True,
                'target_folder': export_folder,
                'network_name': base_network_name,
            },
        )

    return pipe_graph_rendering
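
The rendering builder imports the Cytoscape bindings lazily, so it fails early with an ImportError when the optional dependencies are missing. Assuming the extras are installed and a Cytoscape instance is running, a call could look like the following; the export folder is an illustrative value, the default is SAVE_PATH_FOLDER.

from pathlib import Path

# Requires the optional Cytoscape dependencies and a running Cytoscape session.
pipe_render = build_tk_graph_render_pipe(
    with_subgraphs=True,
    export_folder=Path('out/graph_images'),  # illustrative target folder
    base_network_name='token_graph',
)
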
def build_tk_graph_rescaling_pipe(save_result: bool, exit_point: lang_main.types.EntryPoints) ‑> Pipeline
def build_tk_graph_rescaling_pipe(
    save_result: bool,
    exit_point: EntryPoints,
) -> Pipeline:
    pipe_graph_rescaling = Pipeline(name='Graph_Rescaling', working_dir=SAVE_PATH_FOLDER)
    pipe_graph_rescaling.add(
        graphs.pipe_rescale_graph_edge_weights,
    )
    pipe_graph_rescaling.add(
        graphs.pipe_add_graph_metrics,
        save_result=save_result,
        filename=exit_point,
    )

    return pipe_graph_rescaling
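
The rescaling pipeline normalizes the edge weights of the token graph and then attaches graph metrics, optionally persisting the result under the caller-supplied exit_point. The following networkx snippet sketches a min-max rescaling of the 'weight' attribute; it is an assumption about the general idea, not the pipe_rescale_graph_edge_weights implementation.

import networkx as nx


def rescale_edge_weights(graph: nx.Graph, new_min: float = 0.0, new_max: float = 1.0) -> nx.Graph:
    # Min-max rescale the 'weight' attribute of every edge into [new_min, new_max].
    weights = nx.get_edge_attributes(graph, 'weight')
    lo, hi = min(weights.values()), max(weights.values())
    span = (hi - lo) or 1.0  # avoid division by zero for uniform weights
    rescaled = {
        edge: new_min + (value - lo) / span * (new_max - new_min)
        for edge, value in weights.items()
    }
    nx.set_edge_attributes(graph, rescaled, name='weight')
    return graph
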