diff --git a/data_analysis/02-3_oracle_workflow_test.py b/data_analysis/02-3_oracle_workflow_test.py index 6c783e8..82eec9c 100644 --- a/data_analysis/02-3_oracle_workflow_test.py +++ b/data_analysis/02-3_oracle_workflow_test.py @@ -1,4 +1,5 @@ # %% +import json import time from collections.abc import Sequence from pathlib import Path @@ -165,8 +166,7 @@ print(stmt.compile(engine)) # %% # raw data query -# TODO change to left join, otherwise possible that requests are missed -# TODO after that: look for entries which do not have an associated title number +# TODO look for entries which do not have an associated title number print("--------------- ext_bedpbed --------------") t1 = time.perf_counter() @@ -199,7 +199,50 @@ elapsed = t2 - t1 print(f"Query duration: {elapsed:.4f} sec") print("Number of entries: ", len(df)) print(f"Estimated size in memory: {df.estimated_size(unit='mb')} MB") +# %% +# SAVING/LOADING +p_save = Path.cwd() / "raw_data_from_sql_query_20251202-2.arrow" +# df.write_ipc(p_save) +df = pl.read_ipc(p_save) +# %% +len(df) +df.head() +# 4591588: in title database with different MANDANT (are MANDANTFUEHR and BEDP_MAN feasible for matching?) +# %% +df.filter(pl.col("BEDP_MAN").is_in((1, 90))).filter(pl.col("MELDENUMMER")) +# %% +# !! CHECK: null values set in the query with CASE statement +print(len(df.filter(pl.col("MELDENUMMER") == 18))) +# df.filter(pl.col("MELDENUMMER") == 18).filter((pl.col("BEDP_MENGE_BEDARF_VM").is_not_null()) & (pl.col("BEDP_MENGE_BEDARF_VM") > 0)) +df.filter(pl.col("BEDP_MENGE_BEDARF_VM") > pl.col("MENGE_VORMERKER")) +# %% +# !! CHECK: titles with request where no title information is found +# not_in_title_table = df.filter(pl.col("BEDP_MAN").is_in((1, 90))).filter( +# pl.col("MELDENUMMER").is_null() +# ) +# EXPORT_FEAT = "BEDP_TITELNR" +# to_save = {EXPORT_FEAT: not_in_title_table.select(EXPORT_FEAT).to_series().to_list()} +# p_save_not_in_title_table = Path.cwd() / "not_in_title_table.json" +# with open(p_save_not_in_title_table, "w") as file: +# json.dump(to_save, file, indent=4) +# %% +# !! CHECK: different MANDANTEN +# check for valid entries for unknown MANDANTEN +# MANDANT = 80 + +# print(f"Mandant: {MANDANT}") +# print( +# df.filter(pl.col("BEDP_MAN") == MANDANT).select( +# ["BEDP_MENGE_BEDARF_VM", "MELDENUMMER", "MENGE_VORMERKER"] +# ) +# ) +# print( +# df.filter(pl.col("BEDP_MAN") == MANDANT).select( +# ["BEDP_MENGE_BEDARF_VM", "MELDENUMMER", "MENGE_VORMERKER"] +# ).null_count() +# ) +# print("Unique value counts: ", df.select(pl.col("BEDP_MAN").value_counts())) # %% # VM_CRITERION = "MENGE_VORMERKER" VM_CRITERION = "BEDP_MENGE_BEDARF_VM" @@ -221,7 +264,7 @@ def get_raw_data() -> pl.DataFrame: ).label("BEDP_MENGE_BEDARF_VM"), db.ext_titel_info.c.MELDENUMMER, db.ext_titel_info.c.MENGE_VORMERKER, - ).select_from(db.ext_bedpbed.join(db.ext_titel_info, join_condition)) + ).select_from(db.ext_bedpbed.join(db.ext_titel_info, join_condition, isouter=True)) return pl.read_database( stmt, diff --git a/data_analysis/not_in_title_table.json b/data_analysis/not_in_title_table.json new file mode 100644 index 0000000..52a7c12 --- /dev/null +++ b/data_analysis/not_in_title_table.json @@ -0,0 +1,49 @@ +{ + "BEDP_TITELNR": [ + 6132326, + 4836777, + 4836779, + 3370676, + 3370678, + 6261428, + 6261430, + 8254295, + 8139588, + 6178366, + 6178367, + 8139587, + 6178370, + 6178371, + 8139586, + 8139585, + 4837536, + 3369003, + 6132318, + 6132319, + 8254301, + 6132322, + 6132323, + 4838000, + 4838001, + 4836769, + 4836770, + 8139590, + 8139591, + 3369002, + 4837537, + 3408130, + 3408132, + 5227666, + 5227665, + 5227663, + 5227661, + 139058, + 9126790, + 5917263, + 7112355, + 1462793, + 1216207, + 507075, + 8254294 + ] +} \ No newline at end of file diff --git a/data_analysis/queries.sql b/data_analysis/queries.sql new file mode 100644 index 0000000..19e1ca3 --- /dev/null +++ b/data_analysis/queries.sql @@ -0,0 +1,39 @@ +-- SELECT count(*) FROM EXT_TITEL_INFO +-- SELECT * FROM EXT_TITEL_INFO +set timing on + +SELECT count(*) FROM EXT_BEDPBED; + +-- PROMPT No Hashing allowed... +-- SELECT /*+ NO_USE_HASH(bedp t_info) */ +-- bedp.BEDARFNR, +-- bedp.BEDP_SEQUENZ, +-- bedp.BEDP_TITELNR, +-- bedp.BEDP_MAN, +-- bedp.BEDP_MENGE_BEDARF_VM, +-- t_info.MELDENUMMER, +-- t_info.MENGE_VORMERKER +-- FROM EXT_BEDPBED bedp +-- LEFT JOIN EXT_TITEL_INFO t_info +-- ON bedp.BEDP_TITELNR = t_info.TI_NUMMER +-- AND bedp.BEDP_MAN = t_info.MANDFUEHR; +-- PROMPT #################################### + +-- PROMPT All allowed +-- SELECT +-- bedp.BEDARFNR, +-- bedp.BEDP_SEQUENZ, +-- bedp.BEDP_TITELNR, +-- bedp.BEDP_MAN, +-- bedp.BEDP_MENGE_BEDARF_VM, +-- t_info.MELDENUMMER, +-- t_info.MENGE_VORMERKER +-- FROM EXT_BEDPBED bedp +-- LEFT JOIN EXT_TITEL_INFO t_info +-- ON bedp.BEDP_TITELNR = t_info.TI_NUMMER +-- AND bedp.BEDP_MAN = t_info.MANDFUEHR; +-- -- WHERE bedp.BEDP_MAN IN (1, 90) AND t_info.MELDENUMMER != 26; +-- PROMPT ###################################### + +-- SELECT * FROM EXT_TITEL_INFO t_info WHERE t_info.TI_NUMMER = 6132326; +SELECT * FROM EXT_TITEL_INFO t_info WHERE t_info.TI_NUMMER = 4591588; diff --git a/data_analysis/raw_data_from_sql_query_20251202-1.arrow b/data_analysis/raw_data_from_sql_query_20251202-1.arrow new file mode 100644 index 0000000..c77c22e Binary files /dev/null and b/data_analysis/raw_data_from_sql_query_20251202-1.arrow differ diff --git a/data_analysis/raw_data_from_sql_query_20251202-2.arrow b/data_analysis/raw_data_from_sql_query_20251202-2.arrow new file mode 100644 index 0000000..f1e04ae Binary files /dev/null and b/data_analysis/raw_data_from_sql_query_20251202-2.arrow differ