Drop the "null_count" helper column instead of selecting base_columns

This commit is contained in:
2026-06-08 07:48:09 +02:00
parent 56b88adac3
commit c0cb16a893

View File

@@ -73,7 +73,6 @@ def preprocess_psm(
# drop duplicates # drop duplicates
# use null count as information measure, least amount of nulls should be contained # use null count as information measure, least amount of nulls should be contained
base_columns = data.columns
data = data.with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count")) data = data.with_columns(pl.sum_horizontal(pl.all().is_null()).alias("null_count"))
data = data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie", "null_count"], descending=False) data = data.sort(PRIM_KEYS + ["Meldezeitpunkt_Historie", "null_count"], descending=False)
filtered_data = pl.concat( filtered_data = pl.concat(
@@ -81,7 +80,7 @@ def preprocess_psm(
filtered_data, filtered_data,
data.filter( data.filter(
~pl.struct(PRIM_KEYS + ["Meldezeitpunkt_Historie"]).is_first_distinct() ~pl.struct(PRIM_KEYS + ["Meldezeitpunkt_Historie"]).is_first_distinct()
).select(base_columns), ).drop("null_count"),
] ]
) )
data = data.filter(pl.struct(PRIM_KEYS + ["Meldezeitpunkt_Historie"]).is_first_distinct()) data = data.filter(pl.struct(PRIM_KEYS + ["Meldezeitpunkt_Historie"]).is_first_distinct())