diff --git a/lang_main_config.toml b/lang_main_config.toml index c3d3e6e..0935eee 100644 --- a/lang_main_config.toml +++ b/lang_main_config.toml @@ -1,27 +1,24 @@ # lang_main: Config file -[info] -pkg = 'lang_main' [paths] -inputs = './inputs/' +inputs = './data/' # results = './results/dummy_N_1000/' # dataset = '../data/Dummy_Dataset_N_1000.csv' -results = './results/test_20240807/' -dataset = '../data/02_202307/Export4.csv' +results = './data/' +models = '../lang-models' [logging] enabled = true stderr = true file = true -# only debugging features, production-ready pipelines should always -# be fully executed +# control which pipelines are executed [control] -preprocessing_skip = true +preprocessing_skip = false token_analysis_skip = false graph_postprocessing_skip = false graph_rescaling_skip = false -graph_static_rendering_skip = false +graph_static_rendering_skip = true time_analysis_skip = true [preprocess] diff --git a/notebooks/misc.ipynb b/notebooks/misc.ipynb index b354b50..9ca6c24 100644 --- a/notebooks/misc.ipynb +++ b/notebooks/misc.ipynb @@ -155,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 1, "id": "0a48d11d-1f2b-475e-9ddf-bb9a3f67accb", "metadata": {}, "outputs": [], @@ -165,53 +165,471 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 2, "id": "e340377a-0df4-44ca-b18e-8b354e273eb9", "metadata": {}, "outputs": [], "source": [ - "save_pth = Path.cwd() / 'test.graphml'" + "save_pth = Path.cwd() / 'tk_graph_built.pkl'\n", + "pth_export = Path.cwd() / 'tk_graph_built'\n", + "assert save_pth.exists()" ] }, { "cell_type": "code", - "execution_count": 82, - "id": "66677ad0-a1e5-4772-a0ba-7fbeeda55297", + "execution_count": 3, + "id": "8aba87b2-e924-4748-98c6-05c4676f3c08", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking iteratively for config file. 
Start: A:\\Arbeitsaufgaben\\lang-main\\src\\lang_main, stop folder: src\n", + "Loaded TOML config file successfully.\n", + "Loaded config from: >>A:\\Arbeitsaufgaben\\lang-main\\lang_main_config.toml<<\n", + "Library path is: A:\\Arbeitsaufgaben\\lang-main\n", + "Root path is: A:\\Arbeitsaufgaben\n" + ] + } + ], "source": [ - "nx.write_graphml(G, save_pth)" + "from lang_main import io" ] }, { "cell_type": "code", - "execution_count": 84, - "id": "f01ebe25-56b9-410a-a2bf-d5a6e211de7a", + "execution_count": 5, + "id": "728fdce3-cfe0-4c4b-bcbf-c3d61547e94f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-19 12:24:12 +0000 | lang_main:io:INFO | Loaded file successfully.\n" + ] + } + ], "source": [ - "G_load = nx.read_graphml(save_pth, node_type=int)" + "G = io.load_pickle(save_pth)" ] }, { "cell_type": "code", - "execution_count": 85, - "id": "10bfad35-1f96-41a1-9014-578313502e6c", + "execution_count": 6, + "id": "fcb74134-4192-4d68-a535-f5a502d02b67", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "OutEdgeView([(1, 2), (1, 3), (1, 4), (2, 4), (2, 1), (3, 4)])" + "" ] }, - "execution_count": 85, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "G_load.edges" + "G.undirected" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "024560e5-373a-46f4-b0f7-7794535daa78", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "65745c09-e834-47c9-b761-aabd5fa03e57", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-19 11:41:58 +0000 | lang_main:graphs:INFO | Successfully saved graph as GraphML file under A:\\Arbeitsaufgaben\\lang-main\\notebooks\\tk_graph_built.graphml.\n" + ] + } + ], + "source": [ + "G.to_GraphML(Path.cwd(), 'tk_graph_built')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "8da93a34-7bb1-4b9c-851b-0793c2c483bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TokenGraph(name: TokenGraph, number of nodes: 13, number of edges: 9)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "1cc98c6e-4fa5-49ed-bb97-c62d8e103ccc", + "metadata": {}, + "outputs": [], + "source": [ + "G_copy = G.copy()\n", + "G_copy = G_copy.undirected" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a5044a6f-8903-4e33-a56e-abf647019d8a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'degree_weighted': 2}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 4}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 2}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n", + "{'degree_weighted': 1}\n" + ] + } + ], + "source": [ + "for node in G_copy.nodes:\n", + " print(G_copy.nodes[node])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "720015a4-7338-4fce-ac6c-38e691e0efa4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "b21a34c8-5748-42b1-947e-76c3ba02a240", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + 
"execution_count": 77, + "id": "1bcd4a31-2ea5-4123-8fc3-27017acd7259", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'weight': 1}\n", + "{'weight': 1}\n", + "{'weight': 20}\n", + "{'weight': 15}\n", + "{'weight': 10}\n", + "{'weight': 6}\n", + "{'weight': 1}\n", + "{'weight': 1}\n", + "{'weight': 1}\n" + ] + } + ], + "source": [ + "for edge in G_copy.edges:\n", + " print(G_copy.edges[edge])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfe5bacb-1f51-45d8-b920-ca9cba01282c", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "48e14d11-0319-47a9-81be-c696f69d55b3", + "metadata": {}, + "outputs": [], + "source": [ + "import networkx as nx" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "479deea9-a44f-48eb-95d3-7a326a83f62c", + "metadata": {}, + "outputs": [], + "source": [ + "mapping = nx.betweenness_centrality(G_copy, normalized=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "2299dc98-0a82-4009-8b38-790ffe3a3fd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Kontrolle': 0.015151515151515152,\n", + " 'Lichtschranke': 0.0,\n", + " 'Überprüfung': 0.09090909090909091,\n", + " 'Spannrolle': 0.0,\n", + " 'Druckventil': 0.0,\n", + " 'Schmiernippel': 0.0,\n", + " 'Inspektion': 0.015151515151515152,\n", + " 'Förderbänder': 0.0,\n", + " 'Reinigung': 0.0,\n", + " 'Luftfilter': 0.0,\n", + " 'Schutzabdeckung': 0.0,\n", + " 'Ölstand': 0.0,\n", + " 'Hydraulik': 0.0}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4c00dfe4-29d3-45d8-9eec-6d8e1fbda910", + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(G_copy, mapping, name='BC')" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "0e1119e1-700d-4ae0-bb36-6d4a8fe57698", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'degree_weighted': 2, 'BC': 0.015151515151515152}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 4, 'BC': 0.09090909090909091}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 2, 'BC': 0.015151515151515152}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0}\n" + ] + } + ], + "source": [ + "nodes_prop_mapping = {}\n", + "for node in G_copy.nodes:\n", + " node_data = G_copy.nodes[node]\n", + " prio = node_data['degree_weighted'] * node_data['BC']\n", + " nodes_prop_mapping[node] = prio\n", + " print(G_copy.nodes[node])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "2970267e-5c9e-472f-857b-382e2406f34d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Kontrolle': 0.030303030303030304,\n", + " 'Lichtschranke': 0.0,\n", + " 'Überprüfung': 0.36363636363636365,\n", + " 'Spannrolle': 0.0,\n", + " 'Druckventil': 0.0,\n", + " 'Schmiernippel': 0.0,\n", + " 'Inspektion': 0.030303030303030304,\n", + " 'Förderbänder': 0.0,\n", + " 'Reinigung': 0.0,\n", + " 'Luftfilter': 0.0,\n", + " 'Schutzabdeckung': 0.0,\n", + " 'Ölstand': 0.0,\n", + " 
'Hydraulik': 0.0}" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nodes_prop_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "ef2a1065-9a47-4df4-a7d9-ccfe30dbcdc7", + "metadata": {}, + "outputs": [], + "source": [ + "nx.set_node_attributes(G_copy, nodes_prop_mapping, name='prio')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "2b77597a-be7a-498a-8374-f98324574619", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'degree_weighted': 2, 'BC': 0.015151515151515152, 'prio': 0.030303030303030304}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 4, 'BC': 0.09090909090909091, 'prio': 0.36363636363636365}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 2, 'BC': 0.015151515151515152, 'prio': 0.030303030303030304}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n", + "{'degree_weighted': 1, 'BC': 0.0, 'prio': 0.0}\n" + ] + } + ], + "source": [ + "for node in G_copy.nodes:\n", + " print(G_copy.nodes[node])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "089ae831-f307-473d-8a70-e72bc580ca61", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81d9e8ad-4372-4e3b-a42b-e5b343802936", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ea0a1fc3-571f-4ce2-8561-b0a8f93e5072", + "metadata": {}, + "outputs": [], + "source": [ + "def build_init_graph():\n", + " edge_weights = [\n", + " {'weight': 1},\n", + " {'weight': 2},\n", + " {'weight': 3},\n", + " {'weight': 4},\n", + " {'weight': 5},\n", + " {'weight': 6},\n", + " ]\n", + " edges = [\n", + " (1, 2),\n", + " (1, 3),\n", + " (2, 4),\n", + " (3, 4),\n", + " (1, 4),\n", + " (2, 1),\n", + " ]\n", + " edges_to_add = []\n", + " for i, edge in enumerate(edges):\n", + " edge = list(edge)\n", + " edge.append(edge_weights[i]) # type: ignore\n", + " edges_to_add.append(tuple(edge))\n", + "\n", + " G = nx.DiGraph()\n", + "\n", + " G.add_edges_from(edges_to_add)\n", + "\n", + " return G" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "6e97af5f-f629-4c71-a1e4-2310ad2f3caa", + "metadata": {}, + "outputs": [], + "source": [ + "G_init = build_init_graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "4ff8fb6b-a10b-448f-b8a5-9989f4bee81e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{1: 0.16666666666666666, 2: 0.0, 3: 0.0, 4: 0.0}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nx.betweenness_centrality(G_init, normalized=True)" ] }, { diff --git a/notebooks/tk_graph_built.graphml b/notebooks/tk_graph_built.graphml new file mode 100644 index 0000000..73538e4 --- /dev/null +++ b/notebooks/tk_graph_built.graphml @@ -0,0 +1,73 @@ + + + + + + + 2 + + + 1 + + + 4 + + + 1 + + + 1 + + + 1 + + + 2 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + 1 + + + diff --git 
a/notebooks/tk_graph_built.pkl b/notebooks/tk_graph_built.pkl new file mode 100644 index 0000000..15b3a5d Binary files /dev/null and b/notebooks/tk_graph_built.pkl differ diff --git a/publish.ps1 b/publish.ps1 new file mode 100644 index 0000000..d16fc9c --- /dev/null +++ b/publish.ps1 @@ -0,0 +1 @@ +pdm publish -r local --skip-existing \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 634c110..a48c764 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "lang-main" -version = "0.1.0a1" +version = "0.1.0a7" description = "Several tools to analyse TOM's data with strong focus on language processing" authors = [ {name = "d-opt GmbH, resp. Florian Förster", email = "f.foerster@d-opt.com"}, @@ -57,6 +57,19 @@ distribution = true [tool.pdm.build] package-dir = "src" +[tool.pdm.resolution] +respect-source-order = true + +[[tool.pdm.source]] +name = "private" +url = "http://localhost:8001/simple" +verify_ssl = false + +[[tool.pdm.source]] +name = "pypi" +url = "https://pypi.org/simple" +exclude_packages = ["lang-main*", "tom-plugin*"] + [tool.pdm.dev-dependencies] notebooks = [ "jupyterlab>=4.2.0", diff --git a/run_test_wo_models+cyto.ps1 b/run_test_wo_models+cyto.ps1 new file mode 100644 index 0000000..637c064 --- /dev/null +++ b/run_test_wo_models+cyto.ps1 @@ -0,0 +1,2 @@ +Remove-Item "./logs" -Force -Recurse +pdm run coverage run -m pytest -m "not mload and not cyto" \ No newline at end of file diff --git a/run_tests.ps1 b/run_tests_all.ps1 similarity index 79% rename from run_tests.ps1 rename to run_tests_all.ps1 index 17271f8..d3e0f79 100644 --- a/run_tests.ps1 +++ b/run_tests_all.ps1 @@ -1,3 +1,4 @@ +Remove-Item "./logs" -Force -Recurse pdm run pytest --cov -n 4 # run docker desktop . "C:\Program Files\Docker\Docker\Docker Desktop.exe" diff --git a/src/lang_main/analysis/graphs.py b/src/lang_main/analysis/graphs.py index 3ebce80..13425ba 100644 --- a/src/lang_main/analysis/graphs.py +++ b/src/lang_main/analysis/graphs.py @@ -16,12 +16,15 @@ from pandas import DataFrame from lang_main.constants import ( EDGE_WEIGHT_DECIMALS, LOGGING_DEFAULT_GRAPHS, + PROPERTY_NAME_BETWEENNESS_CENTRALITY, PROPERTY_NAME_DEGREE_WEIGHTED, + PROPERTY_NAME_IMPORTANCE, ) from lang_main.errors import ( EdgePropertyNotContainedError, EmptyEdgesError, EmptyGraphError, + NodePropertyNotContainedError, ) from lang_main.io import load_pickle, save_pickle from lang_main.loggers import logger_graphs as logger @@ -310,15 +313,98 @@ def add_weighted_degree( property of the edges which contains the weight information, by default 'weight' property_name : str, optional target name for the property containing the weighted degree in nodes, - by default 'degree_weighted' + by default PROPERTY_NAME_DEGREE_WEIGHTED """ - node_degree_mapping = cast( + node_property_mapping = cast( dict[str, float], dict(graph.degree(weight=edge_weight_property)), # type: ignore ) nx.set_node_attributes( graph, - node_degree_mapping, + node_property_mapping, + name=property_name, + ) + + +def add_betweenness_centrality( + graph: DiGraph | Graph, + edge_weight_property: str | None = None, + property_name: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY, +) -> None: + """adds the betweenness centrality as property to each node of the given graph + Operation is performed inplace. 
+ + Parameters + ---------- + graph : DiGraph | Graph + Graph with betweenness centrality as node property added inplace + edge_weight_property : str | None, optional + property of the edges which contains the weight information, + not necessarily needed, by default None + property_name : str, optional + target name for the property containing the betweenness centrality in nodes, + by default PROPERTY_NAME_BETWEENNESS_CENTRALITY + """ + + node_property_mapping = cast( + dict[str, float], + nx.betweenness_centrality(graph, normalized=True, weight=edge_weight_property), # type: ignore + ) + nx.set_node_attributes( + graph, + node_property_mapping, + name=property_name, + ) + + +def add_importance_metric( + graph: DiGraph | Graph, + property_name: str = PROPERTY_NAME_IMPORTANCE, + property_name_weighted_degree: str = PROPERTY_NAME_DEGREE_WEIGHTED, + property_name_betweenness: str = PROPERTY_NAME_BETWEENNESS_CENTRALITY, +) -> None: + """Adds a custom importance metric as property to each node of the given graph. + Can be used to decide which nodes are of high importance and also to build node size + mappings. + Operation is performed inplace. + + Parameters + ---------- + graph : DiGraph | Graph + Graph with importance metric as node property added inplace + property_name : str, optional + target name for the property containing the importance metric in nodes, + by default PROPERTY_NAME_IMPORTANCE + property_name_betweenness : str, optional + name of the node property containing the betweenness centrality, + by default PROPERTY_NAME_BETWEENNESS_CENTRALITY + """ + # build mapping for importance metric + node_property_mapping: dict[str, float] = {} + for node in cast(Iterable[str], graph.nodes): + node_data = cast(dict[str, float], graph.nodes[node]) + + if property_name_weighted_degree not in node_data: + raise NodePropertyNotContainedError( + ( + f'Node data does not contain weighted degree ' + f'with name {property_name_weighted_degree}.' + ) + ) + elif property_name_betweenness not in node_data: + raise NodePropertyNotContainedError( + ( + f'Node data does not contain betweenness centrality ' + f'with name {property_name_betweenness}.'
+ ) + ) + + prio = node_data[property_name_weighted_degree] * node_data[property_name_betweenness] + node_property_mapping[node] = prio + + nx.set_node_attributes( + graph, + node_property_mapping, name=property_name, ) @@ -351,6 +437,8 @@ def pipe_add_graph_metrics( for graph in graphs: graph_copy = copy.deepcopy(graph) add_weighted_degree(graph_copy) + add_betweenness_centrality(graph_copy) + add_importance_metric(graph_copy) collection.append(graph_copy) return tuple(collection) @@ -762,19 +850,3 @@ class TokenGraph(DiGraph): raise ValueError('File format not supported.') return graph - - # TODO check removal - # @classmethod - # def from_pickle( - # cls, - # path: str | Path, - # ) -> Self: - # if isinstance(path, str): - # path = Path(path) - - # if path.suffix not in ('.pkl', '.pickle'): - # raise ValueError('File format not supported.') - - # graph = typing.cast(Self, load_pickle(path)) - - # return graph diff --git a/src/lang_main/config.py b/src/lang_main/config.py index 845d1cd..42e4b00 100644 --- a/src/lang_main/config.py +++ b/src/lang_main/config.py @@ -19,7 +19,7 @@ except ImportError: # ** external packages config # ** Huggingface Hub caching -os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = 'set' +os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1' # ** py4cytoscape config if _has_py4cyto: @@ -36,7 +36,7 @@ BASE_FOLDERNAME: Final[str] = os.environ.get('LANG_MAIN_BASE_FOLDERNAME', 'lang- CONFIG_FILENAME: Final[str] = 'lang_main_config.toml' CYTO_STYLESHEET_FILENAME: Final[str] = r'cytoscape_config/lang_main.xml' PKG_DIR: Final[Path] = Path(__file__).parent -STOP_FOLDER: Final[str] = 'python' +STOP_FOLDER: Final[str] = os.environ.get('LANG_MAIN_STOP_SEARCH_FOLDERNAME', 'src') def load_toml_config( @@ -65,6 +65,7 @@ def load_cfg( starting_path: Path, glob_pattern: str, stop_folder_name: str | None, + lookup_cwd: bool = False, ) -> dict[str, Any]: """Look for configuration file. Internal configs are not used any more because the library behaviour is only guaranteed by external configurations. 
@@ -91,9 +92,10 @@ def load_cfg( LangMainConfigNotFoundError if no config file was found """ - cfg_path: Path | None - print('Looking for cfg file in CWD.', flush=True) - cfg_path = search_cwd(glob_pattern) + cfg_path: Path | None = None + if lookup_cwd: + print('Looking for cfg file in CWD.', flush=True) + cfg_path = search_cwd(glob_pattern) if cfg_path is None: print( diff --git a/src/lang_main/constants.py b/src/lang_main/constants.py index 5ce4001..7106572 100644 --- a/src/lang_main/constants.py +++ b/src/lang_main/constants.py @@ -54,17 +54,13 @@ PICKLE_PROTOCOL_VERSION: Final[int] = 5 # config placed in library path of application (usually "bin") input_path_cfg = LIB_PATH / Path(CONFIG['paths']['inputs']) INPUT_PATH_FOLDER: Final[Path] = input_path_cfg.resolve() -# TODO reactivate later -if not INPUT_PATH_FOLDER.exists(): +if not INPUT_PATH_FOLDER.exists(): # pragma: no cover raise FileNotFoundError(f'Input path >>{INPUT_PATH_FOLDER}<< does not exist.') save_path_cfg = LIB_PATH / Path(CONFIG['paths']['results']) SAVE_PATH_FOLDER: Final[Path] = save_path_cfg.resolve() -if not SAVE_PATH_FOLDER.exists(): +if not SAVE_PATH_FOLDER.exists(): # pragma: no cover raise FileNotFoundError(f'Output path >>{SAVE_PATH_FOLDER}<< does not exist.') -path_dataset_cfg = LIB_PATH / Path(CONFIG['paths']['dataset']) -PATH_TO_DATASET: Final[Path] = path_dataset_cfg.resolve() -# if not PATH_TO_DATASET.exists(): -# raise FileNotFoundError(f'Dataset path >>{PATH_TO_DATASET}<< does not exist.') + # ** control SKIP_PREPROCESSING: Final[bool] = CONFIG['control']['preprocessing_skip'] SKIP_TOKEN_ANALYSIS: Final[bool] = CONFIG['control']['token_analysis_skip'] @@ -82,22 +78,34 @@ MODEL_BASE_FOLDER: Final[Path] = model_folder_cfg.resolve() if not MODEL_BASE_FOLDER.exists(): raise FileNotFoundError('Language model folder not found.') os.environ['SENTENCE_TRANSFORMERS_HOME'] = str(MODEL_BASE_FOLDER) -SPACY_MODEL_NAME: Final[SpacyModelTypes] = SpacyModelTypes.DE_CORE_NEWS_SM -STFR_MODEL_NAME: Final[STFRModelTypes] = STFRModelTypes.ALL_MPNET_BASE_V2 + +# LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found) +# LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; more internal use +# LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; more internal use +# LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx" + +SPACY_MODEL_NAME: Final[str | SpacyModelTypes] = os.environ.get( + 'LANG_MAIN_SPACY_MODEL', SpacyModelTypes.DE_CORE_NEWS_SM +) +STFR_MODEL_NAME: Final[str | STFRModelTypes] = os.environ.get( + 'LANG_MAIN_STFR_MODEL', STFRModelTypes.ALL_MPNET_BASE_V2 +) STFR_DEVICE: Final[STFRDeviceTypes] = STFRDeviceTypes.CPU STFR_SIMILARITY: Final[SimilarityFunction] = SimilarityFunction.COSINE -STFR_BACKEND: Final[STFRBackends] = STFRBackends.TORCH -STFR_MODEL_ARGS_DEFAULT: STFRModelArgs = {} -STFR_MODEL_ARGS_ONNX: STFRModelArgs = { +STFR_BACKEND: Final[str | STFRBackends] = os.environ.get( + 'LANG_MAIN_STFR_BACKEND', STFRBackends.TORCH +) +stfr_model_args_default: STFRModelArgs = {} +stfr_model_args_onnx: STFRModelArgs = { 'file_name': STFRQuantFilenames.ONNX_Q_UINT8, 'provider': ONNXExecutionProvider.CPU, 'export': False, } stfr_model_args: STFRModelArgs if STFR_BACKEND == STFRBackends.ONNX: - stfr_model_args = STFR_MODEL_ARGS_ONNX + stfr_model_args = stfr_model_args_onnx else: - stfr_model_args = STFR_MODEL_ARGS_DEFAULT + stfr_model_args = 
stfr_model_args_default STFR_MODEL_ARGS: Final[STFRModelArgs] = stfr_model_args # ** language dependency analysis @@ -122,6 +130,8 @@ THRESHOLD_SIMILARITY: Final[float] = CONFIG['preprocess']['threshold_similarity' EDGE_WEIGHT_DECIMALS: Final[int] = 4 THRESHOLD_EDGE_NUMBER: Final[int] = CONFIG['graph_postprocessing']['threshold_edge_number'] PROPERTY_NAME_DEGREE_WEIGHTED: Final[str] = 'degree_weighted' +PROPERTY_NAME_BETWEENNESS_CENTRALITY: Final[str] = 'betweenness_centrality' +PROPERTY_NAME_IMPORTANCE: Final[str] = 'importance' # ** graph exports (Cytoscape) CYTO_MAX_NODE_COUNT: Final[int] = 500 diff --git a/src/lang_main/env_vars.txt b/src/lang_main/env_vars.txt new file mode 100644 index 0000000..f18898e --- /dev/null +++ b/src/lang_main/env_vars.txt @@ -0,0 +1,6 @@ +# list of all library's environment variables +LANG_MAIN_STOP_SEARCH_FOLDERNAME : foldername in package directory tree at which the lookup should stop; used to find directory root +LANG_MAIN_BASE_FOLDERNAME : base folder of library, not root (folder in which Python installation is found) +LANG_MAIN_SPACY_MODEL : spaCy model used; if not provided, use constant value defined in library; more internal use +LANG_MAIN_STFR_MODEL : Sentence Transformer model used; if not provided, use constant value defined in library; more internal use +LANG_MAIN_STFR_BACKEND : STFR backend, choice between "torch" and "onnx" \ No newline at end of file diff --git a/src/lang_main/errors.py b/src/lang_main/errors.py index c544674..213b2a4 100644 --- a/src/lang_main/errors.py +++ b/src/lang_main/errors.py @@ -8,6 +8,10 @@ class LanguageModelNotFoundError(Exception): # ** token graph exceptions +class NodePropertyNotContainedError(Exception): + """Error raised if a needed node property is not contained in graph nodes""" + + class EdgePropertyNotContainedError(Exception): """Error raised if a needed edge property is not contained in graph edges""" diff --git a/src/lang_main/lang_main_config.toml b/src/lang_main/lang_main_config.toml index b77c206..7b187f0 100644 --- a/src/lang_main/lang_main_config.toml +++ b/src/lang_main/lang_main_config.toml @@ -1,22 +1,18 @@ # lang_main: Config file -[info] -pkg = 'lang_main_internal' [paths] -inputs = './data/in/' +inputs = '../data/in/' # results = './results/dummy_N_1000/' # dataset = '../data/Dummy_Dataset_N_1000.csv' -results = './data/out/' -dataset = '../data/02_202307/Export4.csv' -models = '../../lang-models' +results = '../data/out/' +models = './lang-models' [logging] enabled = true stderr = true file = true -# only debugging features, production-ready pipelines should always -# be fully executed +# control which pipelines are executed [control] preprocessing_skip = false token_analysis_skip = false diff --git a/src/lang_main/loggers.py b/src/lang_main/loggers.py index 35a575b..41c9c72 100644 --- a/src/lang_main/loggers.py +++ b/src/lang_main/loggers.py @@ -33,7 +33,7 @@ if ENABLE_LOGGING and LOGGING_TO_STDERR: logger_all_handler_stderr = logging.StreamHandler() logger_all_handler_stderr.setLevel(LOGGING_LEVEL_STDERR) logger_all_handler_stderr.setFormatter(logger_all_formater) -else: +else: # pragma: no cover logger_all_handler_stderr = null_handler if ENABLE_LOGGING and LOGGING_TO_FILE: @@ -45,7 +45,7 @@ if ENABLE_LOGGING and LOGGING_TO_FILE: ) logger_all_handler_file.setLevel(LOGGING_LEVEL_FILE) logger_all_handler_file.setFormatter(logger_all_formater) -else: +else: # pragma: no cover logger_all_handler_file = null_handler diff --git a/src/lang_main/pipelines/base.py
b/src/lang_main/pipelines/base.py index 322269c..343685f 100644 --- a/src/lang_main/pipelines/base.py +++ b/src/lang_main/pipelines/base.py @@ -33,6 +33,7 @@ class BasePipeline(ABC): # container for actions to perform during pass self.actions: list[Callable] = [] self.action_names: list[str] = [] + self.action_skip: list[bool] = [] # progress tracking, start at 1 self.curr_proc_idx: int = 1 @@ -104,8 +105,6 @@ class PipelineContainer(BasePipeline): ) -> None: super().__init__(name=name, working_dir=working_dir) - self.action_skip: list[bool] = [] - @override def add( self, @@ -170,6 +169,7 @@ class Pipeline(BasePipeline): self, action: Callable, action_kwargs: dict[str, Any] | None = None, + skip: bool = False, save_result: bool = False, load_result: bool = False, filename: str | None = None, @@ -183,6 +183,7 @@ class Pipeline(BasePipeline): self.actions.append(action) self.action_names.append(action.__name__) self.actions_kwargs.append(action_kwargs.copy()) + self.action_skip.append(skip) self.save_results.append((save_result, filename)) self.load_results.append((load_result, filename)) else: @@ -235,7 +236,13 @@ class Pipeline(BasePipeline): self, starting_values: tuple[Any, ...] | None = None, ) -> tuple[Any, ...]: + first_performed: bool = False + for idx, (action, action_kwargs) in enumerate(zip(self.actions, self.actions_kwargs)): + if self.action_skip[idx]: + self.curr_proc_idx += 1 + continue + # loading if self.load_results[idx][0]: filename = self.load_results[idx][1] @@ -248,8 +255,9 @@ class Pipeline(BasePipeline): self.curr_proc_idx += 1 continue # calculation - if idx == 0: + if not first_performed: args = starting_values + first_performed = True else: args = ret diff --git a/src/lang_main/render/cytoscape.py b/src/lang_main/render/cytoscape.py index a4a918e..9af6ff7 100644 --- a/src/lang_main/render/cytoscape.py +++ b/src/lang_main/render/cytoscape.py @@ -296,7 +296,7 @@ def apply_style_to_network( style_name: str = CYTO_STYLESHEET_NAME, pth_to_stylesheet: Path = CYTO_PATH_STYLESHEET, network_name: str = CYTO_BASE_NETWORK_NAME, - node_size_property: str = 'node_selection', + node_size_property: str = CYTO_SELECTION_PROPERTY, min_node_size: int = 15, max_node_size: int = 40, sandbox_name: str = CYTO_SANDBOX_NAME, diff --git a/tests/analysis/test_graphs.py b/tests/analysis/test_graphs.py index 2d59df3..1145b46 100644 --- a/tests/analysis/test_graphs.py +++ b/tests/analysis/test_graphs.py @@ -2,7 +2,12 @@ import networkx as nx import pytest from lang_main.analysis import graphs -from lang_main.errors import EmptyEdgesError, EmptyGraphError, EdgePropertyNotContainedError +from lang_main.errors import ( + EdgePropertyNotContainedError, + EmptyEdgesError, + EmptyGraphError, + NodePropertyNotContainedError, +) TK_GRAPH_NAME = 'TEST_TOKEN_GRAPH' @@ -231,6 +236,49 @@ def test_add_weighted_degree(): assert graph_obj.nodes[3][property_name] == 6 +def test_add_betweenness_centrality(): + graph_obj = build_init_graph(token_graph=False) + property_name = 'betweenness_centrality' + graphs.add_betweenness_centrality(graph_obj, property_name=property_name) + assert round(graph_obj.nodes[1][property_name], 4) == pytest.approx(0.1667) + assert graph_obj.nodes[2][property_name] == 0 + assert graph_obj.nodes[3][property_name] == 0 + + +def test_add_importance_metric(): + graph_obj = build_init_graph(token_graph=False) + property_name_WD = 'degree_weighted' + graphs.add_weighted_degree(graph_obj, 'weight', property_name_WD) + property_name_BC = 'betweenness_centrality' + 
graphs.add_betweenness_centrality(graph_obj, property_name=property_name_BC) + property_name = 'importance' + graphs.add_importance_metric( + graph_obj, + property_name=property_name, + property_name_weighted_degree=property_name_WD, + property_name_betweenness=property_name_BC, + ) + assert round(graph_obj.nodes[1][property_name], 4) == pytest.approx(2.3333) + assert graph_obj.nodes[2][property_name] == 0 + assert graph_obj.nodes[3][property_name] == 0 + + with pytest.raises(NodePropertyNotContainedError): + graphs.add_importance_metric( + graph_obj, + property_name=property_name, + property_name_weighted_degree='prop_not_contained', + property_name_betweenness=property_name_BC, + ) + + with pytest.raises(NodePropertyNotContainedError): + graphs.add_importance_metric( + graph_obj, + property_name=property_name, + property_name_weighted_degree=property_name_WD, + property_name_betweenness='prop_not_contained', + ) + + def test_static_graph_analysis(): graph_obj = build_init_graph(token_graph=True) (graph_obj,) = graphs.static_graph_analysis(graph_obj) # type: ignore @@ -254,6 +302,20 @@ def test_pipe_add_graph_metrics(): assert graph_collection[1].nodes[1][property_name] == 14 assert graph_collection[1].nodes[2][property_name] == 10 assert graph_collection[1].nodes[3][property_name] == 6 + property_name = 'betweenness_centrality' + assert round(graph_collection[0].nodes[1][property_name], 4) == pytest.approx(0.1667) + assert graph_collection[0].nodes[2][property_name] == 0 + assert graph_collection[0].nodes[3][property_name] == 0 + assert round(graph_collection[1].nodes[1][property_name], 4) == pytest.approx(0.1667) + assert graph_collection[1].nodes[2][property_name] == 0 + assert graph_collection[1].nodes[3][property_name] == 0 + property_name = 'importance' + assert round(graph_collection[0].nodes[1][property_name], 4) == pytest.approx(2.3333) + assert graph_collection[0].nodes[2][property_name] == 0 + assert graph_collection[0].nodes[3][property_name] == 0 + assert round(graph_collection[1].nodes[1][property_name], 4) == pytest.approx(2.3333) + assert graph_collection[1].nodes[2][property_name] == 0 + assert graph_collection[1].nodes[3][property_name] == 0 def test_pipe_rescale_graph_edge_weights(tk_graph): diff --git a/tests/pipelines/test_base.py b/tests/pipelines/test_base.py index 51178db..cc636ef 100644 --- a/tests/pipelines/test_base.py +++ b/tests/pipelines/test_base.py @@ -121,6 +121,7 @@ def test_pipeline_valid(pipeline, alter_content): assert len(pipe.actions) == 1 assert len(pipe.action_names) == 1 assert len(pipe.actions_kwargs) == 1 + assert len(pipe.action_skip) == 1 assert len(pipe.save_results) == 1 assert len(pipe.load_results) == 1 assert pipe.save_results[0] == (False, None) @@ -166,7 +167,7 @@ def test_pipeline_valid_action_load(pipeline, working_dir): test_string = 'test' # action preparation - def valid_action(string, add_content=False): + def valid_action(string, add_content=False): # pragma: no cover if add_content: string += '_2' return string @@ -175,6 +176,7 @@ def test_pipeline_valid_action_load(pipeline, working_dir): assert len(pipe.actions) == 1 assert len(pipe.action_names) == 1 assert len(pipe.actions_kwargs) == 1 + assert len(pipe.action_skip) == 1 assert len(pipe.save_results) == 1 assert len(pipe.load_results) == 1 assert pipe.save_results[0] == (False, None) @@ -209,19 +211,28 @@ def test_pipeline_multiple_actions(pipeline): string += '_3' return string + pipe.add(valid_action, {'add_content': True}, skip=True) pipe.add(valid_action, {'add_content': 
True}) + pipe.add(valid_action_2) - assert len(pipe.actions) == 2 - assert len(pipe.action_names) == 2 - assert len(pipe.actions_kwargs) == 2 - assert len(pipe.save_results) == 2 - assert len(pipe.load_results) == 2 + assert len(pipe.actions) == 3 + assert len(pipe.action_names) == 3 + assert len(pipe.actions_kwargs) == 3 + assert len(pipe.action_skip) == 3 + assert len(pipe.save_results) == 3 + assert len(pipe.load_results) == 3 assert pipe.save_results[1] == (False, None) assert pipe.load_results[1] == (False, None) ret = pipe.run(starting_values=(test_string,)) assert isinstance(ret, tuple) assert pipe._intermediate_result == ret - assert pipe.curr_proc_idx == 3 + assert pipe.curr_proc_idx == 4 assert ret is not None assert ret[0] == 'test_2_3' + + +def test_pipeline_invalid_action(pipeline): + test_string = 'test' + + with pytest.raises(WrongActionTypeError): + pipeline.add(test_string, skip=False) diff --git a/tests/render/test_import.py b/tests/render/test_import.py new file mode 100644 index 0000000..8308577 --- /dev/null +++ b/tests/render/test_import.py @@ -0,0 +1,54 @@ +import builtins +import importlib.util +from importlib import reload + +import pytest + +from lang_main.errors import DependencyMissingError + + +@pytest.fixture(scope='function') +def no_dep(monkeypatch): + import_orig = builtins.__import__ + + def mocked_import(name, globals, locals, fromlist, level): + if name == 'py4cytoscape': + raise ImportError() + return import_orig(name, globals, locals, fromlist, level) + + monkeypatch.setattr(builtins, '__import__', mocked_import) + + +@pytest.fixture(scope='function') +def patch_find_spec(monkeypatch): + find_spec_orig = importlib.util.find_spec + + def mocked_find_spec(*args, **kwargs): + if args[0] == 'py4cytoscape': + return None + else: + return find_spec_orig(*args, **kwargs) + + monkeypatch.setattr(importlib.util, 'find_spec', mocked_find_spec) + + +def test_p4c_available(): + import lang_main.constants + + reload(lang_main.constants) + + assert lang_main.constants.Dependencies.PY4C.value + + +@pytest.mark.usefixtures('patch_find_spec') +def test_p4c_missing(monkeypatch): + import lang_main.constants + + reload(lang_main.constants) + + assert not lang_main.constants.Dependencies.PY4C.value + + with pytest.raises(DependencyMissingError): + from lang_main import render + + reload(render) diff --git a/tests/test_config.py b/tests/test_config.py index a5c4db7..70ec918 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -16,7 +16,7 @@ def test_p4c_dependency(): def test_load_config(): toml_path = config.PKG_DIR / 'lang_main_config.toml' loaded_cfg = config.load_toml_config(toml_path) - assert loaded_cfg['info']['pkg'] == 'lang_main_internal' + assert loaded_cfg['paths']['models'] == './lang-models' def test_get_config_path(): @@ -36,7 +36,7 @@ def test_get_config_path(): assert cyto_internal == cyto_cfg_pth -def test_load_cfg(monkeypatch, tmp_path): +def test_load_cfg_func(monkeypatch, tmp_path): monkeypatch.setattr(Path, 'cwd', lambda: tmp_path) pkg_dir = config.PKG_DIR filename = config.CONFIG_FILENAME @@ -44,21 +44,20 @@ def test_load_cfg(monkeypatch, tmp_path): cfg_pth_internal = (pkg_dir / filename).resolve() ref_config = config.load_toml_config(cfg_pth_internal) + assert ref_config['paths']['models'] == './lang-models' - assert ref_config['info']['pkg'] == 'lang_main_internal' loaded_cfg = config.load_cfg( starting_path=pkg_dir, glob_pattern=filename, stop_folder_name=stop_folder, - cfg_path_internal=cfg_pth_internal, - prefer_internal_config=True, +
lookup_cwd=False, ) - assert loaded_cfg['info']['pkg'] == 'lang_main_internal' + assert loaded_cfg['paths']['models'] == '../lang-models' + loaded_cfg = config.load_cfg( starting_path=pkg_dir, glob_pattern=filename, stop_folder_name=stop_folder, - cfg_path_internal=cfg_pth_internal, - prefer_internal_config=False, + lookup_cwd=True, ) - assert loaded_cfg['info']['pkg'] == 'lang_main' + assert loaded_cfg['paths']['models'] == '../lang-models' diff --git a/tests/test_model_loader.py b/tests/test_model_loader.py index f734409..914b67c 100644 --- a/tests/test_model_loader.py +++ b/tests/test_model_loader.py @@ -4,12 +4,12 @@ from spacy.language import Language from lang_main import model_loader from lang_main.constants import ( - STFR_MODEL_ARGS_ONNX, SimilarityFunction, SpacyModelTypes, STFRBackends, STFRDeviceTypes, STFRModelTypes, + stfr_model_args_onnx, ) from lang_main.errors import LanguageModelNotFoundError from lang_main.types import LanguageModels @@ -69,7 +69,7 @@ def test_load_sentence_transformer_onnx(model_name, similarity_func) -> None: similarity_func=similarity_func, backend=STFRBackends.ONNX, device=STFRDeviceTypes.CPU, - model_kwargs=STFR_MODEL_ARGS_ONNX, # type: ignore + model_kwargs=stfr_model_args_onnx, # type: ignore ) assert isinstance(model, SentenceTransformer) diff --git a/tests/work_dir/Pipe-test_Step-2_valid_action_2.pkl b/tests/work_dir/Pipe-test_Step-2_valid_action_2.pkl index 900f352..0872859 100644 Binary files a/tests/work_dir/Pipe-test_Step-2_valid_action_2.pkl and b/tests/work_dir/Pipe-test_Step-2_valid_action_2.pkl differ diff --git a/tests/work_dir/Pipe-test_Step-3_valid_action_2.pkl b/tests/work_dir/Pipe-test_Step-3_valid_action_2.pkl new file mode 100644 index 0000000..900f352 Binary files /dev/null and b/tests/work_dir/Pipe-test_Step-3_valid_action_2.pkl differ
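
A minimal usage sketch of the new node-metric helpers added in src/lang_main/analysis/graphs.py; the toy graph and the expected values are taken from tests/analysis/test_graphs.py, everything else here is illustrative only:

import networkx as nx

from lang_main.analysis import graphs

# toy graph from the test suite: six weighted, directed edges
G = nx.DiGraph()
G.add_edges_from(
    [
        (1, 2, {'weight': 1}),
        (1, 3, {'weight': 2}),
        (2, 4, {'weight': 3}),
        (3, 4, {'weight': 4}),
        (1, 4, {'weight': 5}),
        (2, 1, {'weight': 6}),
    ]
)

graphs.add_weighted_degree(G)         # node 1: 1 + 2 + 5 + 6 = 14
graphs.add_betweenness_centrality(G)  # node 1: ~0.1667 (normalized)
graphs.add_importance_metric(G)       # node 1: 14 * 0.1667 ~ 2.3333

print(G.nodes[1])
# {'degree_weighted': 14, 'betweenness_centrality': 0.1666..., 'importance': 2.3333...}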
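
Likewise, a sketch of the new per-action skip flag on Pipeline.add, mirroring test_pipeline_multiple_actions in tests/pipelines/test_base.py; the constructor arguments and the toy actions are assumptions modeled on the test fixtures, not part of this patch:

from pathlib import Path

from lang_main.pipelines.base import Pipeline

def add_suffix(string, add_content=False):
    if add_content:
        string += '_2'
    return string

def add_suffix_2(string):
    return string + '_3'

# name/working_dir arguments assumed from the BasePipeline constructor
pipe = Pipeline(name='demo', working_dir=Path.cwd())
pipe.add(add_suffix, {'add_content': True}, skip=True)  # registered, but never executed
pipe.add(add_suffix, {'add_content': True})
pipe.add(add_suffix_2)

ret = pipe.run(starting_values=('test',))
print(ret[0])  # 'test_2_3' - the skipped first action left the starting value untouched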