generated from dopt-python/py311
add first code, exclude Fraunhofer
parent 4b59bf6089
commit f65e1f2981
.gitignore (vendored, 3 changed lines)
@@ -1,11 +1,12 @@
 # own
-prototypes/
+prototypes/Fraunhofer/
 data/
 reports/
 *.code-workspace
 # credentials
 CREDENTIALS*
 
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
prototypes/01_first_analyse.py (new file, 128 lines added)
@@ -0,0 +1,128 @@
# %%
import json
import pprint
from collections import Counter
from pathlib import Path

import pandas as pd
import polars as pl

WRITE_TO_DISK = False

# %%
p_data_base = (Path.cwd() / "../data/Datenauszug_20251212").resolve()
assert p_data_base.exists()

print("Total number of JSON files")
len(tuple(p_data_base.glob("**/*.json")))
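# (note: as a bare expression the count above is only displayed when run as an
#  interactive "# %%" cell; wrap it in print() if the file is executed as a script)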

# %%
# check size / contents of each folder
folders = tuple(p_data_base.glob("*"))
folder_items = {}

for folder in folders:
    num_elements = len(tuple(folder.glob("*.json")))
    if num_elements == 0:
        continue
    folder_items[folder] = num_elements
    print(f"Folder {folder} contains: {num_elements} items")

# %%
max_item_folder = Path()
max_val = float("-inf")

for k, v in folder_items.items():
    if v > max_val:
        max_item_folder = k
        max_val = v

print(f"Maximum number of items: {max_val}\nFolder: {max_item_folder}")

# %%
pprint.pprint(folder_items)

# %%
# analyse type numbers for each folder
folder_to_types = []
for idx, folder in enumerate(folders):
    folder_types = []
    for file in folder.glob("*.json"):
        with open(file, "r") as f:
            data = json.load(f)

        type_num = data["initial"]["dsc_TypeNumber"]["value"]
        folder_types.append(type_num)

    type_num_count = Counter(folder_types)
    folder_to_types.append((folder.name, type_num_count))

typenum_counter = Counter()
for idx in range(len(folder_to_types)):
    typenum_counter.update(folder_to_types[idx][1])

max_type_num, type_num_count = typenum_counter.most_common(1)[0]
print(f"Max type number is: {max_type_num}")
print(f"Number of occurrences: {type_num_count}")

# %%
typenum_counter

# %%
folder_to_types

# %%
# ** one file is one curve
# concatenate all files in one table

# %%
# one time-series entry is defined by its schema:
# [ts, ps, pressure, valve]
# [timestep, process_step, pressure_value, valve_value]
# valid states are ps = [101, 102, 110]
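#
# for illustration, one raw JSON file is assumed to look roughly like this
# (field names taken from the reading code below; the concrete values and the
# exact timestamp format are made up):
#
# {
#     "initial": {"dsc_TypeNumber": {"value": 3}},
#     "rows": [
#         {"ts": "2025-12-12T08:00:00Z", "ps": 101, "DU1260": 1.25, "V1560": false},
#         {"ts": "2025-12-12T08:00:01Z", "ps": 102, "DU1260": 1.31, "V1560": true}
#     ]
# }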

schema = {
    "DU1260": pl.Float64,
    "V1560": pl.Boolean,
    "ps": pl.UInt32,
    "ts": pl.String,
    "type_num": pl.UInt8,
    "id": pl.UInt64,
}
df = pl.DataFrame(schema=schema)
count = 0
for idx, file in enumerate(p_data_base.glob("**/*.json")):
    with open(file, "r") as f:
        data = json.load(f)

    type_num = data["initial"]["dsc_TypeNumber"]["value"]
    df_file = pl.DataFrame(data["rows"], schema_overrides=schema)
    df_file = df_file.with_columns(
        pl.lit(type_num).alias("type_num").cast(pl.UInt8),
        pl.lit(idx).alias("id").cast(pl.UInt64),
    )
    df = pl.concat((df, df_file))
    count += 1

df = df.with_columns(pl.col("ts").str.to_datetime(time_zone="UTC"))
df = df.select(["id", "type_num", "ts", "ps", "DU1260", "V1560"])
df.head()
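
# a common polars pattern is to collect the per-file frames in a list and
# concatenate once after the loop instead of concatenating inside it; a minimal
# sketch of the same loop under that pattern (names as defined above):
#
# frames = []
# for idx, file in enumerate(p_data_base.glob("**/*.json")):
#     with open(file, "r") as f:
#         data = json.load(f)
#     type_num = data["initial"]["dsc_TypeNumber"]["value"]
#     frames.append(
#         pl.DataFrame(data["rows"], schema_overrides=schema).with_columns(
#             pl.lit(type_num).alias("type_num").cast(pl.UInt8),
#             pl.lit(idx).alias("id").cast(pl.UInt64),
#         )
#     )
# df = pl.concat(frames)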

# %%
print(f"Files processed: {count}")
print(f"Length of obtained data: {len(df)}")

# %%
concat_data = p_data_base / "all_data.parquet"
if WRITE_TO_DISK:
    df.write_parquet(concat_data)
else:
    df = pl.read_parquet(concat_data)
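# note: with WRITE_TO_DISK = False the df built above is replaced by the cached
# parquet file, so this cell assumes all_data.parquet was written on an earlier run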

# %%
df.head()
print(f"Number of entries in data: {len(df)}")
print(f"Number of curves in data: {len(df.select('id').unique())}")

# %%
# valid ps = 101, 102, 110
# drop every curve (id) that contains an invalid / error state
invalid_ids = df.filter(~pl.col("ps").is_in((101, 102, 110))).select("id").unique()
print(f"Number of invalid IDs: {len(invalid_ids)}")
df = df.filter(~pl.col("id").is_in(invalid_ids["id"].implode()))
print(f"Number of curves in data after cleansing: {len(df.select('id').unique())}")

# %%
df.select(["ts", "DU1260"])

# %%
df.plot.line(x="ts", y="DU1260")
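
# the line plot above draws every curve in one connected trace; to inspect a
# single curve, filter on one id first, e.g. (any id present in df works, the
# first value is used here purely as an example):
# df.filter(pl.col("id") == df["id"][0]).plot.line(x="ts", y="DU1260")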

# %%