From f65e1f29817900c61e9e762f853af0977b4bc2d9 Mon Sep 17 00:00:00 2001
From: foefl
Date: Tue, 13 Jan 2026 11:19:02 +0100
Subject: [PATCH] add first code, exclude Fraunhofer

---
 .gitignore                     |   3 +-
 prototypes/01_first_analyse.py | 128 +++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 prototypes/01_first_analyse.py

diff --git a/.gitignore b/.gitignore
index 21e6a91..b4462ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,12 @@
 # own
-prototypes/
+prototypes/Fraunhofer/
 data/
 reports/
 *.code-workspace

 # credentials
 CREDENTIALS*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/prototypes/01_first_analyse.py b/prototypes/01_first_analyse.py
new file mode 100644
index 0000000..d5c8883
--- /dev/null
+++ b/prototypes/01_first_analyse.py
@@ -0,0 +1,128 @@
+# %%
+import json
+import pprint
+from collections import Counter
+from pathlib import Path
+
+import pandas as pd
+import polars as pl
+
+WRITE_TO_DISK = False
+
+# %%
+p_data_base = (Path.cwd() / "../data/Datenauszug_20251212").resolve()
+assert p_data_base.exists()
+
+print("Total number of JSON files")
+len(tuple(p_data_base.glob("**/*.json")))
+
+# %%
+# check how many JSON files each folder contains
+folders = tuple(p_data_base.glob("*"))
+folder_items = {}
+
+for folder in folders:
+    num_elements = len(tuple(folder.glob("*.json")))
+    if num_elements == 0:
+        continue
+    folder_items[folder] = num_elements
+    print(f"Folder {folder} contains: {num_elements} items")
+# %%
+max_item_folder = Path()
+max_val = float("-inf")
+
+for k, v in folder_items.items():
+    if v > max_val:
+        max_item_folder = k
+        max_val = v
+
+print(f"Maximum number of items: {max_val}\nFolder: {max_item_folder}")
+# %%
+pprint.pprint(folder_items)
+
+# %%
+# analyse type numbers for each folder
+folder_to_types = []
+for idx, folder in enumerate(folders):
+    folder_types = []
+    for file in folder.glob("*.json"):
+        with open(file, "r") as f:
+            data = json.load(f)
+
+        type_num = data["initial"]["dsc_TypeNumber"]["value"]
+        folder_types.append(type_num)
+
+    type_num_count = Counter(folder_types)
+    folder_to_types.append((folder.name, type_num_count))
+
+typenum_counter = Counter()
+for idx in range(len(folder_to_types)):
+    typenum_counter.update(folder_to_types[idx][1])
+
+max_type_num, type_num_count = typenum_counter.most_common(1)[0]
+print(f"Most frequent type number: {max_type_num}")
+print(f"Number of occurrences: {type_num_count}")
+# %%
+typenum_counter
+# %%
+folder_to_types
+# %%
+# one file is one curve
+# concatenate all files into one table
+# %%
+# one time-series entry is defined by its schema:
+# [ts, ps, pressure, valve]
+# [timestep, process_step, pressure_value, valve_value]
+# valid states are ps = [101, 102, 110]
+
+schema = {
+    "DU1260": pl.Float64,
+    "V1560": pl.Boolean,
+    "ps": pl.UInt32,
+    "ts": pl.String,
+    "type_num": pl.UInt8,
+    "id": pl.UInt64,
+}
+df = pl.DataFrame(schema=schema)
+count = 0
+for idx, file in enumerate(p_data_base.glob("**/*.json")):
+    with open(file, "r") as f:
+        data = json.load(f)
+
+    type_num = data["initial"]["dsc_TypeNumber"]["value"]
+    df_file = pl.DataFrame(data["rows"], schema_overrides=schema)
+    df_file = df_file.with_columns(
+        pl.lit(type_num).alias("type_num").cast(pl.UInt8),
+        pl.lit(idx).alias("id").cast(pl.UInt64),
+    )
+    df = pl.concat((df, df_file))
+    count += 1
+
+df = df.with_columns(pl.col("ts").str.to_datetime(time_zone="UTC"))
+df = df.select(["id", "type_num", "ts", "ps", "DU1260", "V1560"])
+df.head()
+# %%
+print(f"Files processed: {count}")
+print(f"Length of obtained data: {len(df)}")
+# %%
+concat_data = p_data_base / "all_data.parquet"
+if WRITE_TO_DISK:
+    df.write_parquet(concat_data)
+else:
+    df = pl.read_parquet(concat_data)
+# %%
+df.head()
+print(f"Number of entries in data: {len(df)}")
+print(f"Number of curves in data: {len(df.select('id').unique())}")
+# %%
+# valid ps = 101, 102, 110
+# filter out all curves that contain invalid process states
+invalid_ids = df.filter(~pl.col("ps").is_in((101, 102, 110))).select("id").unique()
+print(f"Number of invalid IDs: {len(invalid_ids)}")
+df = df.filter(~pl.col("id").is_in(invalid_ids["id"].implode()))
+print(f"Number of curves in data after cleansing: {len(df.select('id').unique())}")
+# %%
+df.select(["ts", "DU1260"])
+# %%
+df.plot.line(x="ts", y="DU1260")
+# %%