From d5546b7fd01cf58bc71e911ce5085b5faeec8828 Mon Sep 17 00:00:00 2001 From: foefl Date: Tue, 13 Jan 2026 16:13:22 +0100 Subject: [PATCH] basic data analysis --- pdm.lock | 75 ++++++++++++++++- prototypes/01_first_analyse.py | 142 ++++++++++++++++++++++++++++++--- pyproject.toml | 1 + 3 files changed, 207 insertions(+), 11 deletions(-) diff --git a/pdm.lock b/pdm.lock index e823467..3c9fabb 100644 --- a/pdm.lock +++ b/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "data", "dev", "lint", "nb", "tests"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:17e3ecabeaf176fccc05c9f1c4d567ce29a09cf0cf905e53f399e4319116285c" +content_hash = "sha256:c32739e18120fad1a688e3030b22d6e3a8a1d1412f5f16295cab4ea173479591" [[metadata.targets]] requires_python = ">=3.11" @@ -2935,6 +2935,79 @@ files = [ {file = "ruff-0.14.11.tar.gz", hash = "sha256:f6dc463bfa5c07a59b1ff2c3b9767373e541346ea105503b4c0369c520a66958"}, ] +[[package]] +name = "scipy" +version = "1.17.0" +requires_python = ">=3.11" +summary = "Fundamental algorithms for scientific computing in Python" +groups = ["data"] +dependencies = [ + "numpy<2.7,>=1.26.4", +] +files = [ + {file = "scipy-1.17.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:2abd71643797bd8a106dff97894ff7869eeeb0af0f7a5ce02e4227c6a2e9d6fd"}, + {file = "scipy-1.17.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:ef28d815f4d2686503e5f4f00edc387ae58dfd7a2f42e348bb53359538f01558"}, + {file = "scipy-1.17.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:272a9f16d6bb4667e8b50d25d71eddcc2158a214df1b566319298de0939d2ab7"}, + {file = "scipy-1.17.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7204fddcbec2fe6598f1c5fdf027e9f259106d05202a959a9f1aecf036adc9f6"}, + {file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fc02c37a5639ee67d8fb646ffded6d793c06c5622d36b35cfa8fe5ececb8f042"}, + {file = "scipy-1.17.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:dac97a27520d66c12a34fd90a4fe65f43766c18c0d6e1c0a80f114d2260080e4"}, + {file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ebb7446a39b3ae0fe8f416a9a3fdc6fba3f11c634f680f16a239c5187bc487c0"}, + {file = "scipy-1.17.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:474da16199f6af66601a01546144922ce402cb17362e07d82f5a6cf8f963e449"}, + {file = "scipy-1.17.0-cp311-cp311-win_amd64.whl", hash = "sha256:255c0da161bd7b32a6c898e7891509e8a9289f0b1c6c7d96142ee0d2b114c2ea"}, + {file = "scipy-1.17.0-cp311-cp311-win_arm64.whl", hash = "sha256:85b0ac3ad17fa3be50abd7e69d583d98792d7edc08367e01445a1e2076005379"}, + {file = "scipy-1.17.0-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:0d5018a57c24cb1dd828bcf51d7b10e65986d549f52ef5adb6b4d1ded3e32a57"}, + {file = "scipy-1.17.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:88c22af9e5d5a4f9e027e26772cc7b5922fab8bcc839edb3ae33de404feebd9e"}, + {file = "scipy-1.17.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:f3cd947f20fe17013d401b64e857c6b2da83cae567adbb75b9dcba865abc66d8"}, + {file = "scipy-1.17.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e8c0b331c2c1f531eb51f1b4fc9ba709521a712cce58f1aa627bc007421a5306"}, + {file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5194c445d0a1c7a6c1a4a4681b6b7c71baad98ff66d96b949097e7513c9d6742"}, + {file = "scipy-1.17.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9eeb9b5f5997f75507814ed9d298ab23f62cf79f5a3ef90031b1ee2506abdb5b"}, + {file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:40052543f7bbe921df4408f46003d6f01c6af109b9e2c8a66dd1cf6cf57f7d5d"}, + {file = "scipy-1.17.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0cf46c8013fec9d3694dc572f0b54100c28405d55d3e2cb15e2895b25057996e"}, + {file = "scipy-1.17.0-cp312-cp312-win_amd64.whl", hash = "sha256:0937a0b0d8d593a198cededd4c439a0ea216a3f36653901ea1f3e4be949056f8"}, + {file = 
"scipy-1.17.0-cp312-cp312-win_arm64.whl", hash = "sha256:f603d8a5518c7426414d1d8f82e253e454471de682ce5e39c29adb0df1efb86b"}, + {file = "scipy-1.17.0-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:65ec32f3d32dfc48c72df4291345dae4f048749bc8d5203ee0a3f347f96c5ce6"}, + {file = "scipy-1.17.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:1f9586a58039d7229ce77b52f8472c972448cded5736eaf102d5658bbac4c269"}, + {file = "scipy-1.17.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9fad7d3578c877d606b1150135c2639e9de9cecd3705caa37b66862977cc3e72"}, + {file = "scipy-1.17.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:423ca1f6584fc03936972b5f7c06961670dbba9f234e71676a7c7ccf938a0d61"}, + {file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe508b5690e9eaaa9467fc047f833af58f1152ae51a0d0aed67aa5801f4dd7d6"}, + {file = "scipy-1.17.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6680f2dfd4f6182e7d6db161344537da644d1cf85cf293f015c60a17ecf08752"}, + {file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:eec3842ec9ac9de5917899b277428886042a93db0b227ebbe3a333b64ec7643d"}, + {file = "scipy-1.17.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d7425fcafbc09a03731e1bc05581f5fad988e48c6a861f441b7ab729a49a55ea"}, + {file = "scipy-1.17.0-cp313-cp313-win_amd64.whl", hash = "sha256:87b411e42b425b84777718cc41516b8a7e0795abfa8e8e1d573bf0ef014f0812"}, + {file = "scipy-1.17.0-cp313-cp313-win_arm64.whl", hash = "sha256:357ca001c6e37601066092e7c89cca2f1ce74e2a520ca78d063a6d2201101df2"}, + {file = "scipy-1.17.0-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:ec0827aa4d36cb79ff1b81de898e948a51ac0b9b1c43e4a372c0508c38c0f9a3"}, + {file = "scipy-1.17.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:819fc26862b4b3c73a60d486dbb919202f3d6d98c87cf20c223511429f2d1a97"}, + {file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = 
"sha256:363ad4ae2853d88ebcde3ae6ec46ccca903ea9835ee8ba543f12f575e7b07e4e"}, + {file = "scipy-1.17.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:979c3a0ff8e5ba254d45d59ebd38cde48fce4f10b5125c680c7a4bfe177aab07"}, + {file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:130d12926ae34399d157de777472bf82e9061c60cc081372b3118edacafe1d00"}, + {file = "scipy-1.17.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6e886000eb4919eae3a44f035e63f0fd8b651234117e8f6f29bad1cd26e7bc45"}, + {file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:13c4096ac6bc31d706018f06a49abe0485f96499deb82066b94d19b02f664209"}, + {file = "scipy-1.17.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:cacbaddd91fcffde703934897c5cd2c7cb0371fac195d383f4e1f1c5d3f3bd04"}, + {file = "scipy-1.17.0-cp313-cp313t-win_amd64.whl", hash = "sha256:edce1a1cf66298cccdc48a1bdf8fb10a3bf58e8b58d6c3883dd1530e103f87c0"}, + {file = "scipy-1.17.0-cp313-cp313t-win_arm64.whl", hash = "sha256:30509da9dbec1c2ed8f168b8d8aa853bc6723fede1dbc23c7d43a56f5ab72a67"}, + {file = "scipy-1.17.0-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:c17514d11b78be8f7e6331b983a65a7f5ca1fd037b95e27b280921fe5606286a"}, + {file = "scipy-1.17.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:4e00562e519c09da34c31685f6acc3aa384d4d50604db0f245c14e1b4488bfa2"}, + {file = "scipy-1.17.0-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f7df7941d71314e60a481e02d5ebcb3f0185b8d799c70d03d8258f6c80f3d467"}, + {file = "scipy-1.17.0-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:aabf057c632798832f071a8dde013c2e26284043934f53b00489f1773b33527e"}, + {file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a38c3337e00be6fd8a95b4ed66b5d988bac4ec888fd922c2ea9fe5fb1603dd67"}, + {file = "scipy-1.17.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:00fb5f8ec8398ad90215008d8b6009c9db9fa924fd4c7d6be307c6f945f9cd73"}, + {file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f2a4942b0f5f7c23c7cd641a0ca1955e2ae83dedcff537e3a0259096635e186b"}, + {file = "scipy-1.17.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:dbf133ced83889583156566d2bdf7a07ff89228fe0c0cb727f777de92092ec6b"}, + {file = "scipy-1.17.0-cp314-cp314-win_amd64.whl", hash = "sha256:3625c631a7acd7cfd929e4e31d2582cf00f42fcf06011f59281271746d77e061"}, + {file = "scipy-1.17.0-cp314-cp314-win_arm64.whl", hash = "sha256:9244608d27eafe02b20558523ba57f15c689357c85bdcfe920b1828750aa26eb"}, + {file = "scipy-1.17.0-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:2b531f57e09c946f56ad0b4a3b2abee778789097871fc541e267d2eca081cff1"}, + {file = "scipy-1.17.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:13e861634a2c480bd237deb69333ac79ea1941b94568d4b0efa5db5e263d4fd1"}, + {file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:eb2651271135154aa24f6481cbae5cc8af1f0dd46e6533fb7b56aa9727b6a232"}, + {file = "scipy-1.17.0-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:c5e8647f60679790c2f5c76be17e2e9247dc6b98ad0d3b065861e082c56e078d"}, + {file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5fb10d17e649e1446410895639f3385fd2bf4c3c7dfc9bea937bddcbc3d7b9ba"}, + {file = "scipy-1.17.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8547e7c57f932e7354a2319fab613981cde910631979f74c9b542bb167a8b9db"}, + {file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:33af70d040e8af9d5e7a38b5ed3b772adddd281e3062ff23fec49e49681c38cf"}, + {file = "scipy-1.17.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:f9eb55bb97d00f8b7ab95cb64f873eb0bf54d9446264d9f3609130381233483f"}, + {file = "scipy-1.17.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1ff269abf702f6c7e67a4b7aad981d42871a11b9dd83c58d2d2ea624efbd1088"}, + {file 
= "scipy-1.17.0-cp314-cp314t-win_arm64.whl", hash = "sha256:031121914e295d9791319a1875444d55079885bbae5bdc9c5e0f2ee5f09d34ff"},
+    {file = "scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e"},
+]
+
 [[package]]
 name = "seaborn"
 version = "0.13.2"
diff --git a/prototypes/01_first_analyse.py b/prototypes/01_first_analyse.py
index d5c8883..7362923 100644
--- a/prototypes/01_first_analyse.py
+++ b/prototypes/01_first_analyse.py
@@ -2,12 +2,13 @@
 import json
 import pprint
 from collections import Counter
+from datetime import datetime
 from pathlib import Path
+from zoneinfo import ZoneInfo
 
 import pandas as pd
 import polars as pl
-
-WRITE_TO_DISK = False
+from scipy import stats
 
 # %%
 p_data_base = (Path.cwd() / "../data/Datenauszug_20251212").resolve()
@@ -75,7 +76,7 @@ folder_to_types
 # [timestep, process_step, pressure_value, valve_value]
 # valid states are ps = [101, 102, 110]
 
-schema = {
+schema_read = {  # dtypes applied while reading the raw JSON rows of one file
     "DU1260": pl.Float64,
     "V1560": pl.Boolean,
     "ps": pl.UInt32,
@@ -83,37 +84,60 @@ schema = {
     "type_num": pl.UInt8,
     "id": pl.UInt64,
 }
-df = pl.DataFrame(schema=schema)
+schema = {  # dtypes of the accumulated frame, incl. the derived delta columns
+    "DU1260": pl.Float64,
+    "V1560": pl.Boolean,
+    "ps": pl.UInt32,
+    "ts": pl.Datetime,  # made UTC-aware below to match the parsed per-file "ts"
+    "type_num": pl.UInt8,
+    "id": pl.UInt64,
+    "ts_delta_step": pl.Duration,
+    "ts_delta_cum": pl.Duration,
+}
+df = pl.DataFrame(schema=schema).with_columns(pl.col("ts").dt.replace_time_zone("UTC"))
 count = 0
 for idx, file in enumerate(p_data_base.glob("**/*.json")):
     with open(file, "r") as f:
         data = json.load(f)
 
     type_num = data["initial"]["dsc_TypeNumber"]["value"]
-    df_file = pl.DataFrame(data["rows"], schema_overrides=schema)
+    df_file = pl.DataFrame(data["rows"], schema_overrides=schema_read)
     df_file = df_file.with_columns(
+        pl.col("ts").str.to_datetime(time_zone="UTC"),
         pl.lit(type_num).alias("type_num").cast(pl.UInt8),
         pl.lit(idx).alias("id").cast(pl.UInt64),
     )
+    df_file = df_file.with_columns(
+        (pl.col.ts - pl.col.ts.shift(1))  # difference to the previous row's "ts"
+        .alias("ts_delta_step")
+        .fill_null(pl.lit(0).cast(pl.Duration))  # first row has no predecessor
+    )
+    df_file = df_file.with_columns(
+        pl.col("ts_delta_step").cum_sum().alias("ts_delta_cum"),  # running total
+    )
     df = pl.concat((df, df_file))
     count += 1
 
-df = df.with_columns(pl.col("ts").str.to_datetime(time_zone="UTC"))
-df = df.select(["id", "type_num", "ts", "ps", "DU1260", "V1560"])
+# "ts" is now parsed per file inside the loop, before the deltas are derived
+df = df.select(
+    ["id", "type_num", "ts", "ts_delta_step", "ts_delta_cum", "ps", "DU1260", "V1560"]
+)
 df.head()
 # %%
 print(f"Files processed: {count}")
 print(f"Length of obtained data: {len(df)}")
 # %%
+WRITE_TO_DISK = False  # False: load the stored parquet file instead of writing it
+
 concat_data = p_data_base / "all_data.parquet"
 if WRITE_TO_DISK:
     df.write_parquet(concat_data)
 else:
     df = pl.read_parquet(concat_data)
 # %%
-df.head()
 print(f"Number of entries in data: {len(df)}")
 print(f"Number of curves in data: {len(df.select('id').unique())}")
+df.head()
 # %%
 # valid ps = 101, 102, 110
 # filter all entries which contain invalid error states
@@ -121,8 +145,106 @@ invalid_ids = df.filter(~pl.col("ps").is_in((101, 102, 110))).select("id").uniqu
 print(f"Number of invalid IDs: {len(invalid_ids)}")
 df = df.filter(~pl.col("id").is_in(invalid_ids["id"].implode()))
 print(f"Number of curves in data after cleansing: {len(df.select('id').unique())}")
+# sort chronologically within each curve: by id, then by ts (both ascending)
+df = df.sort(by=["id", "ts"], descending=[False, False])
 # %%
-df.select(["ts", "DU1260"])
+# filter for relevant type number with maximum number of entries
+TARGET_TYPE_NUM = 2
+df = df.filter(pl.col.type_num == TARGET_TYPE_NUM)
+print(f"Number of entries for type num {TARGET_TYPE_NUM}: {len(df)}")
+print(f"Number of curves in data: {len(df.select('id').unique())}")
 # %%
-df.plot.line(x="ts", y="DU1260")
+current_time = datetime.now(tz=ZoneInfo("UTC"))  # common anchor for all curves
+df_reconst = df.with_columns(
+    (pl.col.ts_delta_cum + pl.lit(current_time)).alias("reconstructed")
+)
+# %%
+df_reconst
+# %%
+collection =
df_reconst.select(pl.col.id).unique().sort(by="id")["id"][:10]  # ten smallest ids
+# %%
+series = df_reconst.filter(pl.col.id.is_in(collection))
+series
+# %%
+series.select(pl.exclude("ts_delta_step", "ts_delta_cum")).plot.line(
+    x="reconstructed", y="DU1260"
+)
+
+# %%
+series.group_by("id").agg(pl.col("ts_delta_cum").max())
+# %%
+series.group_by("id").agg(pl.len())
+
+# ** simple stats
+# try to separate anomalies by time/duration
+# // "Duration Anomalies": curve duration = largest cumulative delta per id
+# IQR rule (Tukey fences): flag curves outside [Q1 - FACTOR*IQR, Q3 + FACTOR*IQR]
+durations = df_reconst.group_by("id").agg(pl.col("ts_delta_cum").max())
+durations = durations.with_columns(pl.col.ts_delta_cum.dt.total_microseconds())
+durations.head()
+
+FACTOR = 1.5  # fence multiplier applied to the IQR
+iqr = stats.iqr(durations["ts_delta_cum"])
+quantiles = stats.quantile(durations["ts_delta_cum"], [0.25, 0.75])
+print(f"Quantiles (0.25, 0.75): {quantiles}")
+print(f"IQR: {iqr}")
+iqr_lb = max(quantiles[0] - FACTOR * iqr, 0)  # Q1 fence; durations are never < 0
+iqr_ub = quantiles[1] + FACTOR * iqr  # Q3 fence, in microseconds like `durations`
+print(f"Lower bound: {iqr_lb}")
+print(f"Upper bound: {iqr_ub}")
+durations.describe()
+# %%
+df_reconst.filter(pl.col.ps == 102).filter(
+    pl.col.ts_delta_cum > pl.duration(microseconds=iqr_ub)
+)
+# %%
+filter_out_time = (
+    df_reconst.filter(pl.col.ts_delta_cum > pl.duration(microseconds=iqr_ub))
+    .select("id")
+    .unique()
+)
+df_out_time = df_reconst.filter(pl.col.id.is_in(filter_out_time["id"].implode()))
+df_out_time
+# TODO calculate duration for each phase
+ids_out = df_out_time["id"].unique().implode()
+df_remain = df_reconst.filter(~pl.col.id.is_in(ids_out))
+df_remain
+# %%
+df_analyse = (
+    df_remain.group_by("id")
+    .agg(pl.len().alias("count"), pl.col("ts_delta_cum").max())
+    .with_columns(
+        (pl.col.count / pl.col.ts_delta_cum.dt.total_microseconds()).alias(
+            "mean_sampling_rate"  # rows per microsecond of curve duration
+        )
+    )
+)
+# %%
+df_analyse.describe()
+# %%
+df_analyse2 = (
+    df_reconst.group_by("id")
+    .agg(pl.len().alias("count"), pl.col("ts_delta_cum").max())
+    .with_columns(
+        (pl.col.count / pl.col.ts_delta_cum.dt.total_microseconds()).alias(
+            "mean_sampling_rate"  # same metric, but over all curves incl. outliers
+        )
+    )
+)
+df_analyse2.describe()
+# %%
+df2  # NOTE(review): `df2` is not defined in the visible hunks; confirm it exists
+# %%
+series
+# %%
+series.head()
+# %%
+temp = df.filter(pl.col.id.is_in(collection))
+temp
+# %%
+# re-anchor the sample curves on `current_time`; the elapsed-time column is
+# `ts_delta_cum` (`ts_delta` is not among the columns selected into `df`)
+temp = temp.with_columns((pl.col.ts_delta_cum + pl.lit(current_time)).alias("reconstructed"))
+# %%
+temp
 # %%
diff --git a/pyproject.toml b/pyproject.toml
index 17dec88..1ab3c4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -153,4 +153,5 @@ data = [
     "polars>=1.37.1",
     "seaborn>=0.13.2",
     "altair>=6.0.0",
+    "scipy>=1.17.0",
 ]