add first code, exclude Fraunhofer

Florian Förster 2026-01-13 11:19:02 +01:00
parent 4b59bf6089
commit f65e1f2981
2 changed files with 130 additions and 1 deletion

.gitignore

@@ -1,11 +1,12 @@
 # own
-prototypes/
+prototypes/Fraunhofer/
 data/
 reports/
 *.code-workspace
 # credentials
 CREDENTIALS*
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]


@@ -0,0 +1,128 @@
# %%
import json
import pprint
from collections import Counter
from pathlib import Path
import pandas as pd
import polars as pl
WRITE_TO_DISK = False
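# flag for the caching cell further below: True writes the concatenated table to
# parquet, False loads the previously written parquet from disk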
# %%
p_data_base = (Path.cwd() / "../data/Datenauszug_20251212").resolve()
assert p_data_base.exists()
print("Total number of JSON files")
len(tuple(p_data_base.glob("**/*.json")))
# %%
# check how many JSON files each subfolder contains
folders = tuple(p_data_base.glob("*"))
folder_items = {}
for folder in folders:
    num_elements = len(tuple(folder.glob("*.json")))
    if num_elements == 0:
        continue
    folder_items[folder] = num_elements
    print(f"Folder {folder} contains: {num_elements} items")
# %%
max_item_folder = Path()
max_val = float("-inf")
for k, v in folder_items.items():
    if v > max_val:
        max_item_folder = k
        max_val = v
print(f"Maximum number of items: {max_val}\nFolder: {max_item_folder}")
# %%
pprint.pprint(folder_items)
# %%
# analyse type numbers for each folder
folder_to_types = []
for folder in folders:
    folder_types = []
    for file in folder.glob("*.json"):
        with open(file, "r") as f:
            data = json.load(f)
        type_num = data["initial"]["dsc_TypeNumber"]["value"]
        folder_types.append(type_num)
    type_num_count = Counter(folder_types)
    folder_to_types.append((folder.name, type_num_count))

# aggregate the per-folder counters into one global counter
typenum_counter = Counter()
for _, counts in folder_to_types:
    typenum_counter.update(counts)
max_type_num, type_num_count = typenum_counter.most_common(1)[0]
print(f"Most common type number: {max_type_num}")
print(f"Number of occurrences: {type_num_count}")
# %%
typenum_counter
# %%
folder_to_types
# %%
# one file is one curve
# concatenate all files into one table
# %%
# one time-series entry is defined by its schema:
# [ts, ps, pressure, valve]
# [timestamp, process_step, pressure_value, valve_value]
# pressure maps to column DU1260 (float), valve to column V1560 (boolean)
# valid process steps are ps = [101, 102, 110]
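# assumed layout of a single JSON file, inferred from the keys accessed in this
# script (illustrative values only):
# {
#   "initial": {"dsc_TypeNumber": {"value": 3}},
#   "rows": [
#     {"ts": "2025-12-01T08:00:00Z", "ps": 101, "DU1260": 0.95, "V1560": true},
#     ...
#   ]
# }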
schema = {
    "DU1260": pl.Float64,
    "V1560": pl.Boolean,
    "ps": pl.UInt32,
    "ts": pl.String,
    "type_num": pl.UInt8,
    "id": pl.UInt64,
}
df = pl.DataFrame(schema=schema)
count = 0
# load every JSON file, tag its rows with the type number and a per-file curve id
for idx, file in enumerate(p_data_base.glob("**/*.json")):
    with open(file, "r") as f:
        data = json.load(f)
    type_num = data["initial"]["dsc_TypeNumber"]["value"]
    df_file = pl.DataFrame(data["rows"], schema_overrides=schema)
    df_file = df_file.with_columns(
        pl.lit(type_num).alias("type_num").cast(pl.UInt8),
        pl.lit(idx).alias("id").cast(pl.UInt64),
    )
    df = pl.concat((df, df_file))
    count += 1

df = df.with_columns(pl.col("ts").str.to_datetime(time_zone="UTC"))
df = df.select(["id", "type_num", "ts", "ps", "DU1260", "V1560"])
df.head()
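# note: pl.concat inside the loop above rebuilds the accumulated frame on every
# iteration. A sketch of the usual pattern, collecting the per-file frames first
# and concatenating once at the end (load_file is a hypothetical helper wrapping
# the JSON reading done above):
# frames = [load_file(idx, file) for idx, file in enumerate(p_data_base.glob("**/*.json"))]
# df = pl.concat(frames)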
# %%
print(f"Files processed: {count}")
print(f"Length of obtained data: {len(df)}")
# %%
# cache the concatenated table: write it once, afterwards load it from parquet
concat_data = p_data_base / "all_data.parquet"
if WRITE_TO_DISK:
    df.write_parquet(concat_data)
else:
    df = pl.read_parquet(concat_data)
# %%
print(f"Number of entries in data: {len(df)}")
print(f"Number of curves in data: {len(df.select('id').unique())}")
df.head()
# %%
# valid process steps: ps = 101, 102, 110
# drop every curve (id) that contains at least one row with an invalid process step
invalid_ids = df.filter(~pl.col("ps").is_in((101, 102, 110))).select("id").unique()
print(f"Number of invalid IDs: {len(invalid_ids)}")
df = df.filter(~pl.col("id").is_in(invalid_ids["id"].implode()))
print(f"Number of curves in data after cleansing: {len(df.select('id').unique())}")
# %%
df.select(["ts", "DU1260"])
# %%
df.plot.line(x="ts", y="DU1260")
# %%