From f65e1f29817900c61e9e762f853af0977b4bc2d9 Mon Sep 17 00:00:00 2001
From: foefl
Date: Tue, 13 Jan 2026 11:19:02 +0100
Subject: [PATCH] add first code, exclude Fraunhofer

---
 .gitignore                     |   3 +-
 prototypes/01_first_analyse.py | 128 +++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 prototypes/01_first_analyse.py

diff --git a/.gitignore b/.gitignore
index 21e6a91..b4462ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,12 @@
 # own
-prototypes/
+prototypes/Fraunhofer/
 data/
 reports/
 *.code-workspace

 # credentials
 CREDENTIALS*
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/prototypes/01_first_analyse.py b/prototypes/01_first_analyse.py
new file mode 100644
index 0000000..d5c8883
--- /dev/null
+++ b/prototypes/01_first_analyse.py
@@ -0,0 +1,128 @@
+# %%
+import json
+import pprint
+from collections import Counter
+from pathlib import Path
+
+import pandas as pd
+import polars as pl
+
+WRITE_TO_DISK = False
+
+# %%
+p_data_base = (Path.cwd() / "../data/Datenauszug_20251212").resolve()
+assert p_data_base.exists()
+
+print("Total number of JSON files")
+len(tuple(p_data_base.glob("**/*.json")))
+
+# %%
+# check how many JSON files each folder contains
+folders = tuple(p_data_base.glob("*"))
+folder_items = {}
+
+for folder in folders:
+    num_elements = len(tuple(folder.glob("*.json")))
+    if num_elements == 0:
+        continue
+    folder_items[folder] = num_elements
+    print(f"Folder {folder} contains: {num_elements} items")
+# %%
+max_item_folder = Path()
+max_val = float("-inf")
+
+for k, v in folder_items.items():
+    if v > max_val:
+        max_item_folder = k
+        max_val = v
+
+print(f"Maximum number of items: {max_val}\nFolder: {max_item_folder}")
+# %%
+pprint.pprint(folder_items)
+
+# %%
+# analyse type numbers for each folder
+folder_to_types = []
+for idx, folder in enumerate(folders):
+    folder_types = []
+    for file in folder.glob("*.json"):
+        with open(file, "r") as f:
+            data = json.load(f)
+
+        type_num = data["initial"]["dsc_TypeNumber"]["value"]
+        folder_types.append(type_num)
+
+    type_num_count = Counter(folder_types)
+    folder_to_types.append((folder.name, type_num_count))
+
+typenum_counter = Counter()
+for idx in range(len(folder_to_types)):
+    typenum_counter.update(folder_to_types[idx][1])
+
+max_type_num, type_num_count = typenum_counter.most_common(1)[0]
+print(f"Most frequent type number: {max_type_num}")
+print(f"Number of occurrences: {type_num_count}")
+# %%
+typenum_counter
+# %%
+folder_to_types
+# %%
+# one file is one curve
+# concatenate all files into one table
+# %%
+# one time-series entry is defined by its schema:
+# [ts, ps, pressure, valve]
+# [timestep, process_step, pressure_value, valve_value]
+# valid states are ps = [101, 102, 110]
+
+schema = {
+    "DU1260": pl.Float64,
+    "V1560": pl.Boolean,
+    "ps": pl.UInt32,
+    "ts": pl.String,
+    "type_num": pl.UInt8,
+    "id": pl.UInt64,
+}
+df = pl.DataFrame(schema=schema)
+count = 0
+for idx, file in enumerate(p_data_base.glob("**/*.json")):
+    with open(file, "r") as f:
+        data = json.load(f)
+
+    type_num = data["initial"]["dsc_TypeNumber"]["value"]
+    df_file = pl.DataFrame(data["rows"], schema_overrides=schema)
+    df_file = df_file.with_columns(
+        pl.lit(type_num).alias("type_num").cast(pl.UInt8),
+        pl.lit(idx).alias("id").cast(pl.UInt64),
+    )
+    df = pl.concat((df, df_file))
+    count += 1
+
+df = df.with_columns(pl.col("ts").str.to_datetime(time_zone="UTC"))
+df = df.select(["id", "type_num", "ts", "ps", "DU1260", "V1560"])
+df.head()
+# %%
+print(f"Files processed: {count}")
+print(f"Length of obtained data: {len(df)}")
+# %%
+concat_data = p_data_base / "all_data.parquet"
+if WRITE_TO_DISK:
+    df.write_parquet(concat_data)
+else:
+    df = pl.read_parquet(concat_data)
+# %%
+df.head()
+print(f"Number of entries in data: {len(df)}")
+print(f"Number of curves in data: {len(df.select('id').unique())}")
+# %%
+# valid ps = 101, 102, 110
+# filter out all curves that contain invalid process states
+invalid_ids = df.filter(~pl.col("ps").is_in((101, 102, 110))).select("id").unique()
+print(f"Number of invalid IDs: {len(invalid_ids)}")
+df = df.filter(~pl.col("id").is_in(invalid_ids["id"].implode()))
+print(f"Number of curves in data after cleansing: {len(df.select('id').unique())}")
+# %%
+df.select(["ts", "DU1260"])
+# %%
+df.plot.line(x="ts", y="DU1260")
+# %%