# lib/manaudit/lib_datou_audit.py  (1042 statements, 4% coverage per coverage.py v7.9.1)
import datetime
import types

import pandas.core.dtypes.generic

# We could extract the datou from the audit_info: it sits right at the beginning.
# But we have no function to build the datou from the json.
# We could also extract the man_audit_info from the datou rather than from the audit.
# In any case we have to search in the datou, or at least in the dict that represents it, for the useful json param.
def parse_audit_info(audit_info, list_action_for_df_to_correct=None, map_modif_df_manual=None):
    # Avoid mutable default arguments: the original defaults ([] and {}) would be
    # shared between calls, and map_modif_df_manual is mutated below.
    if list_action_for_df_to_correct is None:
        list_action_for_df_to_correct = []
    if map_modif_df_manual is None:
        map_modif_df_manual = {}
    nb_manual_action_df_for_col_audit = 0
    try:
        # In fact it is enough to find load_tab
        if "io_exec" not in audit_info:
            return {}, {}

        panda_table_content_as_markdown = ""
        panda_table_content_as_json = {}
        data_to_parse_for_json = ""
        suffix = ""

        map_col_id_to_id_df = {}

        for steps in audit_info["io_exec"]:
            if not steps.isdigit():
                continue
            type_step = audit_info["io_exec"][steps]["datou_step"]
            param_json = audit_info["io_exec"][steps]["param_json"]
            step_input = audit_info["io_exec"][steps]["input"]
            output = audit_info["io_exec"][steps]["output"]
            if type_step == "load_tab":
                if "assoc" in param_json:
                    print("Will maybe fail, we need to find the right assoc")
                if panda_table_content_as_markdown != "":
                    print("Will maybe fail, since we have two load_tab steps")
                panda_table_content_as_markdown = step_input["result"] if "result" in step_input else ""
            if type_step == "image_to_text":
                suffix = "google_ocr"
                # param_json =
            if type_step == "map_reduce":
                print("We want the second map_reduce, like id_step 7, or maybe any of them via the result field of input?")
                if "text" in output:
                    data_to_parse_for_json = output["text"]
            if type_step == "format":
                if "df_complet_as_markdown" in output:
                    panda_table_content_as_markdown = output["df_complet_as_markdown"]
                if "df_complet_as_json" in output:
                    panda_table_content_as_json = output["df_complet_as_json"]

        from lib.batch.lib_batch import create_pandas_table_from_text
        df = create_pandas_table_from_text(panda_table_content_as_markdown)
        import pandas as pd
        if panda_table_content_as_json != {} and panda_table_content_as_json is not None and panda_table_content_as_json != "":
            df = pd.read_json(panda_table_content_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])
        if df is None:
            print("pd none from : " + str(panda_table_content_as_markdown))
            df = pd.DataFrame(columns=["Null Column"])
        else:
            df["intro_correct"] = "Le ..."
            df["cr_correct"] = "Le ..."
            df["cr_correct_typo"] = ""
            df["intro_correct_typo"] = ""
            df["cr_front_init"] = "init"
            df["intro_front_init"] = "init"
            df["auto_state"] = """<input type="checkbox" value="" name="bordered-checkbox" class="w-4 h-4 text-blue-600 bg-gray-100 border-gray-300 rounded focus:ring-blue-500 dark:focus:ring-blue-600 dark:ring-offset-gray-800 focus:ring-2 dark:bg-gray-700 dark:border-gray-600">Automation impossible"""
            df["auto_state_val"] = "false"
            df["class_vert_rouge_valider"] = "false"

        from lib.lib_util import parse_json_from_prompt_result, \
            complete_date_and_order_json_to_mettre_en_forme, append_id_by_order
        list_json_to_mettre_en_forme = parse_json_from_prompt_result(data_to_parse_for_json)
        list_json_to_mettre_en_forme = append_id_by_order(list_json_to_mettre_en_forme)
        # if order_by_date:
        #     list_json_to_mettre_en_forme = complete_date_and_order_json_to_mettre_en_forme(list_json_to_mettre_en_forme)

        from lib.lib_util import add_parsing_meta_info_to_table
        # VR 29-3-24: in my opinion this has already been done
        # df = add_parsing_meta_info_to_table(df, list_json_to_mettre_en_forme)

        if len(list_action_for_df_to_correct) > 0:
            for action in list_action_for_df_to_correct:
                type_action = action["type_action"] if "type_action" in action else "default"
                if type_action == "df_meta_info_correct":
                    key_action = create_key_action(action)
                    count = (map_modif_df_manual[key_action]["count"] + 1) if key_action in map_modif_df_manual and "count" in map_modif_df_manual[key_action] else 1
                    map_modif_df_manual[key_action] = action
                    map_modif_df_manual[key_action]["count"] = count
                    map_modif_df_manual[key_action]["audit_or_datou"] = "audit"

                    id_line = action["id"]
                    if id_line == 'NaN':
                        continue
                    manual_value = action["manual_value"]
                    col_name = action["col_name"]
                    # refactor: rename front_value and add a
                    manual = bool(action["manual"]) if "manual" in action else False
                    if manual:
                        nb_manual_action_df_for_col_audit += 1
                    # df.loc[int(id_line)][col_name] = manual_value
                    try:
                        # managing incomplete parsing of the document => not clear why it creates this, but there are some links in the audit bug
                        id_line_int = int(str(id_line).replace(".0", ""))
                    except Exception as e:
                        print(" Pb with action id_line : " + str(id_line) + " " + str(e) + " parsing as float then int !")
                        if id_line != 'None':
                            print("ERROR treated as warning, needs to be audited")
                            id_line_int = int(float(id_line))
                        else:
                            continue

                    if map_col_id_to_id_df == {}:
                        for i in range(len(df)):
                            idx_df = df.index[i]
                            idx_from_data = df.loc[idx_df]["id"]
                            if str(idx_from_data).replace(".0", "").isdigit():
                                map_col_id_to_id_df[str(idx_from_data).replace(".0", "")] = idx_df
                            else:
                                print("ERROR should fail, wrong data will be saved ! " + str(idx_from_data))
                                if str(idx_df).isdigit():
                                    map_col_id_to_id_df[str(idx_df)] = idx_df
                    if str(id_line).replace(".0", "") in map_col_id_to_id_df:
                        id_line_df = map_col_id_to_id_df[str(id_line).replace(".0", "")]
                        # df.iloc[id_line_df][col_name] = manual_value
                        df.loc[id_line_df, col_name] = manual_value
                        # df.loc[:, ('one', 'second')]
                    else:
                        print(" Missing id line in df : " + str(id_line) + " in " + str(map_col_id_to_id_df.keys()))
                        # df.loc[id_line_int, col_name] = manual_value
                else:
                    print("type action not supported : " + str(type_action))

        df_to_html = df.to_html(classes="table pdt-table table-striped", escape=False)

        # TODO VR: this clearly belongs in the HTML! contrary to what ChatGPT says
        return {"dataframe_html": df_to_html,
                "dataframe_text": panda_table_content_as_markdown,
                "suffix": suffix,
                "df": df,
                "paragraphs": []}, {"nb_manual_action_df_for_col_audit": nb_manual_action_df_for_col_audit, "map_modif_df_manual": map_modif_df_manual}
    except Exception as e:
        print(str(e))
        return {"dataframe_html": None,
                "dataframe_text": "",
                "suffix": ""}, {"nb_manual_action_df_for_col_audit": nb_manual_action_df_for_col_audit, "map_modif_df_manual": map_modif_df_manual}


# dataframe_html = df.to_html()
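
# A minimal sketch of the audit_info shape that parse_audit_info walks (hypothetical
# values, for illustration only; the real structure comes from the audit json):
#
#   audit_info = {"io_exec": {
#       "0": {"datou_step": "image_to_text", "param_json": {}, "input": {}, "output": {}},
#       "1": {"datou_step": "map_reduce", "param_json": {}, "input": {},
#             "output": {"text": '[{"id": 0, "document_type": "courrier"}]'}},
#       "2": {"datou_step": "format", "param_json": {}, "input": {},
#             "output": {"df_complet_as_json": '{"id": {"0": 0}, "datet": {"0": null}}'}}}}
#   results, counters = parse_audit_info(audit_info)
#   # results  -> {"dataframe_html": ..., "dataframe_text": ..., "suffix": ..., "df": ..., "paragraphs": []}
#   # counters -> {"nb_manual_action_df_for_col_audit": 0, "map_modif_df_manual": {}}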


def load_sub_json(main_json, key_list_slash):
    try:
        if key_list_slash == "":
            return main_json
        separator = "/"
        # if "/" not in key_list_slash and "." in key_list_slash:
        #     separator = "."
        list_keys = key_list_slash.lstrip(separator).split(separator)
        sub_json = main_json
        for k in list_keys:
            if type(sub_json) == list:
                if not k.isdigit():
                    print("Unexpected index for array in load_sub_json : " + str(k))
                    return None
                k_int = int(k)
                if k_int >= len(sub_json):
                    print("Not enough values in array in load_sub_json")
                    return None
                sub_json = sub_json[k_int]
            elif k in sub_json:  # expected type dict
                sub_json = sub_json[k]
                # if type(sub_json) not in (dict, list):
                #     return sub_json
            else:
                # Dict with integer keys, so we need to convert k to int (used in io_exec)
                if k.isdigit():
                    if int(k) in sub_json:
                        sub_json = sub_json[int(k)]
                    else:
                        print(" missing key " + k + " in " + str(sub_json.keys()))
                        return None
        return sub_json
    except Exception as e:
        print("Exception in load_sub_json")
        print(str(e))
        return None
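
# A small usage sketch of load_sub_json with a slash-separated path (hypothetical data):
#
#   doc = {"io_exec": {"3": {"input": {"paragraphs": [[{"text": "a"}], [{"text": "b"}]]}}}}
#   load_sub_json(doc, "io_exec/3/input/paragraphs/1/0/text")   # -> "b"
#   load_sub_json(doc, "io_exec/3/input/missing")               # -> None (after a printed warning)
#
# List segments must be digit strings; dict segments are tried as strings first, then as ints.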


def set_sub_json(main_json, key_list_slash, value):
    separator = "/"
    # if "/" not in key_list_slash and "." in key_list_slash:
    #     separator = "."
    list_keys = key_list_slash.lstrip(separator).split(separator)
    if key_list_slash.lstrip(separator) == "":
        return value
    sub_json = main_json
    for k in list_keys[:-1]:
        if type(sub_json) == list:
            if not k.isdigit():
                print("Unexpected index for array in set_sub_json : " + str(k))
                return None
            k_int = int(k)
            if k_int >= len(sub_json):
                print("Not enough values in array in set_sub_json")
                return None
            sub_json = sub_json[k_int]
        elif k in sub_json:
            sub_json = sub_json[k]
        else:
            # missing intermediate keys are only reported, not created
            print(" missing key " + k + " in " + str(sub_json.keys()))
    sub_json[list_keys[-1]] = value
    return main_json
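
# Usage sketch for set_sub_json (hypothetical data): the path addresses the parent
# container, and the last segment is the key that gets written:
#
#   doc = {"io_exec": {"9": {"output": {"df_complet_as_json": "{}"}}}}
#   set_sub_json(doc, "io_exec/9/output/df_complet_as_json", '{"id": {"0": 0}}')
#   # doc["io_exec"]["9"]["output"]["df_complet_as_json"] is now '{"id": {"0": 0}}'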


def create_key_action(action):
    if "type_action" not in action or action["type_action"] != "df_meta_info_correct" or "col_name" not in action or "id" not in action:
        return None
    return str(action["id"]) + "_" + str(action["col_name"])


def is_numericf(input):
    if input == "NaN":
        return False
    try:
        float(input)
        return True
    except ValueError:
        return False
    except Exception as e:
        print("Unexpected error treated as warning")
        print(str(e))
        return False
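
# Quick behaviour sketch for is_numericf (assumed semantics, from the code above):
#   is_numericf("3")    # -> True
#   is_numericf("3.0")  # -> True
#   is_numericf("NaN")  # -> False (explicitly rejected, even though float("NaN") parses)
#   is_numericf("abc")  # -> False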


def count_time_lab_by_user(all_result_manual_correct, delta_min_between_save_minutes=10):
    map_user_id_time_modif = {}
    try:
        for result in all_result_manual_correct:
            user_id = result["user_id"] if "user_id" in result else 0
            if user_id not in map_user_id_time_modif:
                map_user_id_time_modif[user_id] = []
            if "created_at" in result and type(result["created_at"]) == datetime.datetime:
                map_user_id_time_modif[user_id].append(result["created_at"])
    except Exception as e:
        print(str(e))

    map_result_time_by_user = {}

    for user_id in map_user_id_time_modif:
        map_user_id_time_modif[user_id] = sorted(list(set(map_user_id_time_modif[user_id])))
        count_minutes = 0
        list_intervals = []
        previous_time = None
        for time_modif in map_user_id_time_modif[user_id]:
            if previous_time is None:
                previous_time = time_modif
                list_intervals.append([time_modif, time_modif])
            else:
                delta = time_modif - previous_time
                delta_in_minutes = delta.total_seconds() / 60.0
                if delta_in_minutes <= delta_min_between_save_minutes:
                    # same interval
                    list_intervals[-1][1] = time_modif
                else:
                    list_intervals.append([time_modif, time_modif])
            previous_time = time_modif

        for interval in list_intervals:
            delta = interval[1] - interval[0]
            delta_in_minutes = delta.total_seconds() / 60.0
            count_minutes += delta_in_minutes + delta_min_between_save_minutes
        map_result_time_by_user[user_id] = {"intervals": list_intervals,
                                            "total_minutes": count_minutes}

    return map_result_time_by_user
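
# Worked example for count_time_lab_by_user (hypothetical timestamps, default
# delta_min_between_save_minutes=10): saves at 10:00, 10:05, 10:12 and 11:00 give
# two intervals, [10:00-10:12] and [11:00-11:00], because 11:00 is more than 10
# minutes after 10:12. Total minutes = (12 + 10) + (0 + 10) = 32, i.e. each
# interval's span plus one "tail" of delta_min_between_save_minutes.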


def list_action_by_user(all_result_manual_correct, df_complete):
    map_user_id_list_pages = {}
    map_user_id_list_pages_for_split = {}
    map_user_id_time_modif = {}
    try:
        for result in all_result_manual_correct:
            if "manual_input_info" not in result or "list_actions" not in result["manual_input_info"]:
                print("Warning missing actions in result : " + str(result))
                continue
            user_id = result["user_id"] if "user_id" in result else 0
            if user_id not in map_user_id_list_pages:
                map_user_id_list_pages[user_id] = []
                map_user_id_time_modif[user_id] = []
                map_user_id_list_pages_for_split[user_id] = []
            if "created_at" in result and type(result["created_at"]) == datetime.datetime:
                map_user_id_time_modif[user_id].append(result["created_at"])
            for action in result["manual_input_info"]["list_actions"]:
                if "id_page" in action:
                    if not str(action["id_page"]).isdigit():
                        print("How come this id_page isn't an int : " + str(action["id_page"]))
                        continue
                    id_page = int(action["id_page"])
                    if id_page not in map_user_id_list_pages[user_id]:
                        map_user_id_list_pages[user_id].append(id_page)
                    if "col_name" in action and action["col_name"] == "Liste des pages":
                        if id_page not in map_user_id_list_pages_for_split[user_id]:
                            map_user_id_list_pages_for_split[user_id].append(id_page)
                elif "id" not in action or not is_numericf(str(action["id"])):
                    print("Warning missing id in action : " + str(action))
                    continue
                else:
                    id = int(float(action["id"]))
                    list_pages = df_complete.loc[df_complete["id"] == id, "Liste des pages"].values[0] if "Liste des pages" in df_complete.columns else ""
                    if list_pages == "" or not list_pages.replace(",", "").isdigit():
                        print("Warning missing list_pages in df_complete for id : " + str(id))
                        continue
                    list_pages_as_list_int = [int(x) for x in list_pages.split(",") if x.strip().isdigit()]
                    map_user_id_list_pages[user_id].extend(list_pages_as_list_int)
                    if "col_name" in action and action["col_name"] == "Liste des pages":
                        map_user_id_list_pages_for_split[user_id].extend(list_pages_as_list_int)
        for user_id in map_user_id_list_pages:
            map_user_id_list_pages[user_id] = sorted(list(set(map_user_id_list_pages[user_id])))
        for user_id in map_user_id_list_pages_for_split:
            map_user_id_list_pages_for_split[user_id] = sorted(list(set(map_user_id_list_pages_for_split[user_id])))
    except Exception as e:
        print(str(e))

    return map_user_id_list_pages, map_user_id_time_modif, map_user_id_list_pages_for_split


def modify_audit_info_from_manual_correction(audit_info, all_result_manual_correct,
                                             manual_action_to_audit_data={},
                                             project_id=None,
                                             lib_user_data_internal=None):
    if manual_action_to_audit_data == {}:
        if lib_user_data_internal is None:
            print(" We need access to the database to get the conf_project and the saxia param")
        else:
            print(" We will load the saxia configuration from the project_id")
            if project_id is None:
                print("Error project_id is None and no manual_action_to_audit_data")

                # We can still try to get the project_id from the hash_id_token, but won't do it right now
                hash_id_treatment = audit_info["config"]["complete_param_json"]["hash_id_treatment"] if "config" in audit_info and "complete_param_json" in audit_info["config"] and "hash_id_treatment" in audit_info["config"]["complete_param_json"] else None
                all_result = lib_user_data_internal.load_data_audit(hash_id_treatment=hash_id_treatment)
                # print(" all_result : " + str(all_result.keys()))
                project_id = all_result["project_id"] if "project_id" in all_result else None

            if project_id is None:
                print(" We can't do anything")
            else:
                conf_project = lib_user_data_internal.load_conf_project(project_id)
                saxia_conf = conf_project["saxia"] if "saxia" in conf_project else {}
                assoc_conf = saxia_conf["assoc"] if "assoc" in saxia_conf else {}
                manual_action_to_audit_data = assoc_conf["manual_action_to_audit_data"] if "manual_action_to_audit_data" in assoc_conf else {}

    nb_modif_class_manual = 0
    nb_manual_action_df = 0
    map_modif_df_manual = {}  # would also allow a projection
    paragraphs = []
    df_auto = None
    # initialized before the try so the bookkeeping below still works if an exception fires early
    list_action_for_df_to_correct = []
    map_count_modif_per_doc = {}
    map_modif_type_document = {}
    audit_info_corrected = audit_info
    try:
        # key_as_list_slash = "io_exec/0/output/paragraphs"
        key_as_list_slash = "io_exec/3/input/paragraphs"
        key_as_list_slash = manual_action_to_audit_data["class_paragraph"]["audit_data"] if "class_paragraph" in manual_action_to_audit_data and "audit_data" in manual_action_to_audit_data["class_paragraph"] else key_as_list_slash

        map_col_id_to_id_df = {}

        paragraphs = load_sub_json(audit_info_corrected, key_as_list_slash)
        # order modifications by increasing date
        all_result_manual_correct_ordered = sorted(all_result_manual_correct, key=lambda x: x["created_at"])
        for result in all_result_manual_correct_ordered:
            created_at_as_string = result["created_at"].strftime("%y%m%d_%H:%M:%S")  # bug fix: minutes are %M, not %m
            print(" Modification from created_at_as_string :" + created_at_as_string + " result id : " + str(result["id"]))
            manual_input_info = result["manual_input_info"] if "manual_input_info" in result else {}
            list_actions = manual_input_info["list_actions"] if "list_actions" in manual_input_info else []
            for action in list_actions:
                type_action = action["type_action"] if "type_action" in action else "default"
                if type_action == "class_paragraph":
                    nb_modif_class_manual += 1
                    # class_action stores an id_page instead of an id_file => significant impact to correct it, so we will probably keep it like this 8/4/24
                    id_page = action["id_page"]
                    if id_page not in map_count_modif_per_doc:
                        map_count_modif_per_doc[id_page] = 0
                    map_count_modif_per_doc[id_page] += 1
                    id_paragraph = action["id_paragraph"] if "id_paragraph" in action else None
                    manual_class = action["manual_class"] + "_class" if "manual_class" in action else None
                    if id_paragraph is None or manual_class is None:
                        print("ERROR most likely due to crops modified again by the switch 2025-05 ")
                        continue
                    # bug fix: the original discarded the result of str.replace
                    manual_class = manual_class.replace("_class_class", "_class")
                    id_page_int = int(id_page)
                    id_paragraph_int = int(id_paragraph)
                    if id_page_int < len(paragraphs):
                        if id_paragraph_int < len(paragraphs[id_page_int]):
                            paragraphs[id_page_int][id_paragraph_int]["class"] = manual_class
                        else:
                            print("ERROR id_paragraph_int greater than len(paragraphs[id_page_int]) " + str(id_paragraph_int) + " nb : " + str(len(paragraphs[id_page_int])) + " for " + str(id_page_int))
                            continue
                    else:
                        print("ERROR id_page_int greater than len(paragraphs) " + str(id_page_int) + " nb : " + str(len(paragraphs)))
                        continue
                    # audit_info_corrected["io_exec"]["3"]["input"]["paragraphs"] = paragraphs

                elif type_action == "df_meta_info_correct":
                    key_action = create_key_action(action)
                    count = (map_modif_df_manual[key_action]["count"] + 1) if key_action in map_modif_df_manual and "count" in map_modif_df_manual[key_action] else 1
                    map_modif_df_manual[key_action] = action
                    map_modif_df_manual[key_action]["count"] = count
                    map_modif_df_manual[key_action]["audit_or_datou"] = "datou"

                    # list_action_for_df_to_correct.append(action)
                    if True:
                        key_for_df_as_md = "io_exec/6/output"  # df_complet_as_markdown
                        key_for_df_as_md = "io_exec/9/output/df_complet_as_markdown"  # df_complet_as_markdown # it's 9 in prod, grrr => that's really not right, grrr
                        key_for_df_as_md = manual_action_to_audit_data["df_meta_info_correct"]["audit_data"] if "df_meta_info_correct" in manual_action_to_audit_data and "audit_data" in \
                            manual_action_to_audit_data["df_meta_info_correct"] else key_for_df_as_md
                        df_as_md = load_sub_json(audit_info_corrected, key_for_df_as_md)
                        key_for_df_as_json = "io_exec/9/output/df_complet_as_json"
                        key_for_df_as_json = manual_action_to_audit_data["df_meta_info_correct_json"]["audit_data"] if "df_meta_info_correct_json" in manual_action_to_audit_data and "audit_data" in \
                            manual_action_to_audit_data["df_meta_info_correct_json"] else key_for_df_as_json
                        df_as_json = load_sub_json(audit_info_corrected, key_for_df_as_json)

                        if df_as_json is not None:
                            import pandas as pd
                            df = pd.read_json(df_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])
                            # from copy import deepcopy
                            if df_auto is None:
                                df_auto = df.copy()  # deep=True
                                # df_auto = deepcopy(df_as_json)
                        elif df_as_md is not None:
                            print("ERROR SINCE 06-2024 : WE NEVER GO THROUGH THIS !?! ")
                            # from lib.batch.lib_batch import create_pandas_table_from_text
                            # df = create_pandas_table_from_text(df_as_md, verbose=False)
                            print("ON 08-2024 : WE COMMENT THE TWO PREVIOUS LINES AND RETURN => LOAD AUDIT WILL CRASH ")
                            return None
                        else:
                            print("ERROR FROM THE START ")
                            from lib.batch.lib_batch import init_df_synchronize
                            df = init_df_synchronize()  # pandas.core.dtypes.generic.create_pandas_abc_type("DataFrame", [])
                        if action["col_name"] in df.columns:
                            id_line = action["id"]
                            # is_numericf is defined above in this module; no self-import needed
                            if not is_numericf(id_line):
                                # if id_line == "" or (id_line != "0" and not str(id_line).rstrip(".0").isdigit()):
                                print(" action not treated since input data is wrong : " + str(id_line) + " " + str(action))
                                continue
                            manual_value = action["manual_value"]
                            col_name = action["col_name"]
                            if col_name == "datet":
                                # print("Will maybe fail : " + str(action))
                                if type(df.loc[0, "datet"]) != pandas._libs.tslibs.timestamps.Timestamp and type(df.loc[0, "datet"]) != datetime.datetime and type(df.loc[0, "datet"]) != pandas._libs.tslibs.nattype.NaTType:
                                    print("Will fail now : " + str(action) + " result id : " + str(result["id"]))
                                    print(str(type(df.loc[0, "datet"])))
                            if col_name == "date_entree_hospitalisationt" or col_name == "date_sortie_hospitalisationt" or col_name == "date_fin_arret_travailt" or col_name == "date_debut_arret_travailt" or col_name == "datet":
                                import dateparser
                                manual_value_parsed = dateparser.parse(manual_value)
                                print(manual_value_parsed)
                                if manual_value_parsed is None:
                                    continue
                            if col_name == "document_type":
                                map_modif_type_document[id_line] = manual_value
                            if map_col_id_to_id_df == {}:
                                for i in range(len(df)):
                                    idx_df = df.index[i]
                                    idx_from_data = df.loc[idx_df]["id"]
                                    if str(idx_from_data).replace(".0", "").isdigit():
                                        map_col_id_to_id_df[str(idx_from_data).replace(".0", "")] = idx_df
                                    else:
                                        print("ERROR should fail, wrong data will be saved ! : " + str(idx_from_data))
                                        # if str(idx_df).isdigit():
                                        #     map_col_id_to_id_df[str(idx_df)] = idx_df
                            if str(id_line).replace(".0", "") in map_col_id_to_id_df:
                                id_line_df = map_col_id_to_id_df[str(id_line).replace(".0", "")]
                                # df.iloc[id_line_df][col_name] = manual_value
                                if col_name == "datet" and ("20" not in str(manual_value) and "19" not in str(manual_value)):
                                    print("Protect wrong date : " + str(manual_value))
                                    manual_value = None
                                df.loc[id_line_df, col_name] = manual_value
                            else:
                                print(" Missing id line in df : " + str(id_line) + " in " + str(map_col_id_to_id_df.keys()))
                                # df.iloc[int(float(id_line))][col_name] = manual_value
                                print("TO TEST")
                            df_as_md = df.to_markdown()
                            df_as_json = df.to_json()

                            # refactor: rename front_value and add a
                            manual = bool(action["manual"]) if "manual" in action else False
                            if manual:
                                nb_manual_action_df += 1

                            set_sub_json(audit_info_corrected, key_for_df_as_md, df_as_md)
                            set_sub_json(audit_info_corrected, key_for_df_as_json, df_as_json)
                            # audit_info_corrected["io_exec"]["9"]["output"]["df_complet_as_markdown"] = df.to_markdown()
                        else:
                            # print("col_name not in df : " + str(action["col_name"]) + " in " + str(df.columns))
                            list_action_for_df_to_correct.append(action)
                elif type_action == "crops":
                    id_page = action["id_page"] if "id_page" in action else None
                    if id_page is None:
                        print("ERROR treated as warning id_page not in action")
                        continue
                    # paragraphs[id_page]["crops"] = action["crops"] if "crops" in action else None

                    if "modify" in action:
                        for modification in action["modify"]:
                            if "id" not in modification:
                                print("ERROR id of paragraph not in modification")
                                continue
                            # bug fix: the original indexed with the undefined name `id`; use the modification's own id
                            id_par = int(modification["id"])
                            if "x" in modification:
                                x = modification["x"]
                                paragraphs[id_page][id_par]["x"] = x
                            if "y" in modification:
                                y = modification["y"]
                                paragraphs[id_page][id_par]["y"] = y
                            if "w" in modification:
                                w = modification["w"]
                                paragraphs[id_page][id_par]["w"] = w
                            if "h" in modification:
                                h = modification["h"]
                                paragraphs[id_page][id_par]["h"] = h
                            if "text" in modification:
                                text = modification["text"]
                                paragraphs[id_page][id_par]["text"] = text

                    if "delete" in action:
                        for id in action["delete"]:
                            if not str(id).isdigit():
                                print("Error due to a deleted crop that has no id")
                                continue
                            if int(id) < len(paragraphs[id_page]):
                                paragraphs[id_page][int(id)] = {}
                                # del paragraphs[id_page][int(id)]
                            else:
                                print("ERROR id of paragraph not in modification")
                    if "add" in action:
                        for one_new_par in action["add"]:
                            new_id = len(paragraphs[id_page])
                            one_new_par["id"] = new_id
                            paragraphs[id_page].append(one_new_par)
                            # if id not in paragraphs[id_page]:
                            #     paragraphs[id_page][id] = {}
                            # else:
                            #     print("ERROR id of paragraph not in modification")
                else:
                    print("type action not supported : " + str(type_action))
    except Exception as e:
        print(str(e))
        print("Error treated as warning (to be audited) in modify_audit_info_from_manual_correction")
        audit_info_corrected = audit_info
        # list_action_for_df_to_correct = []

    results, audit_info_from_datou = parse_audit_info(audit_info_corrected, list_action_for_df_to_correct, map_modif_df_manual)
    nb_manual_action_df_for_col_audit = audit_info_from_datou["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_from_datou else -4
    map_modif_df_manual = audit_info_from_datou["map_modif_df_manual"] if "map_modif_df_manual" in audit_info_from_datou else {}
    audit_info_count = {"nb_modif_class_manual": nb_modif_class_manual, "nb_manual_action_df": nb_manual_action_df,
                        "nb_manual_action_df_for_col_audit": nb_manual_action_df_for_col_audit,
                        "map_modif_df_manual": map_modif_df_manual,
                        "map_count_modif_per_doc": map_count_modif_per_doc,
                        "map_modif_type_document": map_modif_type_document}
    results["paragraphs"] = paragraphs
    return df_auto, results, audit_info_count
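
# Sketch of the manual-correction action shapes this function consumes (field names
# taken from the code above; the values are hypothetical):
#
#   {"type_action": "class_paragraph", "id_page": "2", "id_paragraph": "0",
#    "manual_class": "titre"}                        # reclassify one paragraph
#   {"type_action": "df_meta_info_correct", "id": "3", "col_name": "datet",
#    "manual_value": "2024-03-12", "manual": True}   # overwrite one dataframe cell
#   {"type_action": "crops", "id_page": 1,
#    "modify": [{"id": 0, "x": 10, "y": 20, "w": 100, "h": 40, "text": "..."}],
#    "delete": [2], "add": [{"x": 0, "y": 0, "w": 50, "h": 10, "text": "new"}]}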


def load_audit_info_and_apply_manual_correction(hash_id_treatment_auto=None,
                                                hash_id_treatment_manual=None,
                                                lpgss=None,
                                                limit=None,
                                                project_id=None,
                                                ids_manual=None,
                                                manual_action_to_audit_data={}):
    import json
    df_auto = None
    all_result = lpgss.load_data_audit(hash_id_treatment=hash_id_treatment_manual, col_csv="*")
    # print(" all result : " + str(all_result))
    if all_result is None:
        print(" all_result seems None from hash_id_treatment : " + str(hash_id_treatment_manual))
        all_result = {}
    print(" all_result : " + str(all_result.keys()))
    info_date = all_result["info_date"] if "info_date" in all_result else {}
    info_date["test_var_info_date"] = "We want to find in which hit this data is recorded, either hit_main or cons_hit"
    audit_json_file_content = all_result["audit_info"] if "audit_info" in all_result else {}
    id_file = all_result["id_file"] if "id_file" in all_result else None
    try:
        audit_json_file_content_as_json = json.loads(audit_json_file_content)
    except Exception as e:
        audit_json_file_content_as_json = audit_json_file_content
        print("This is now on the critical path; I think an ajax call would fix the problem, but I don't understand it")
        print(str(e))

    try:
        # parse_audit_info and modify_audit_info_from_manual_correction are defined above in this module
        if limit is not None and int(limit) == 0:
            all_results = []
        else:
            all_results = lpgss.load_data_manual(ids_manual=ids_manual,
                                                 hash_id_treatment=hash_id_treatment_manual,
                                                 limit=limit)
        if len(all_results) == 0:
            results, audit_info_count = parse_audit_info(audit_json_file_content)
            nb_manual_action_df_for_col_audit = audit_info_count["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_count else -3
            # audit_info_count = {}
            df_auto_as_json = None
            print("There are no corrections, to be tested")
            df_auto = results["df"].copy()  # deep=True
        else:
            df_auto, results, audit_info_count = modify_audit_info_from_manual_correction(
                audit_json_file_content_as_json,
                all_results,
                manual_action_to_audit_data=manual_action_to_audit_data,
                project_id=project_id)
    except Exception as e:
        results = {"error": str(e)}
        audit_info_count = {}
        print(str(e))
        print("LOG_TO_PARSE : Error parsing audit result json manual : " + str(hash_id_treatment_manual) + " auto " + str(hash_id_treatment_auto))

    hash_id_treatment_rerun = all_result["info_date"]["consolidate_hash_id_treatment"] if "info_date" in all_result and "consolidate_hash_id_treatment" in all_result["info_date"] else None
    df_cons = all_result["info_consolidate"]["df_cons"] if "info_consolidate" in all_result and "df_cons" in all_result["info_consolidate"] else None

    return df_auto, df_cons, hash_id_treatment_rerun, results, audit_info_count, id_file, audit_json_file_content_as_json, all_results, info_date


def get_list_backup(histo_folder, mtd_id=None):
    # In the histo_folder directory we want to parse json files with names like
    # datou_anon_42_0213_22.json, where 42 is the datou_id and 0213_22 encodes the
    # date (here 22:00 on 02/13); the year should be optional, 2024 by default.
    # We also want to be able to filter by mtd_id, but that is less important.
    # We also want to be able to sort by date for display in the front end.
    import os
    import json
    map_datou_id_date_backup = {}
    map_proj_id_date_backup = {}
    if histo_folder is None:
        return map_datou_id_date_backup, map_proj_id_date_backup
    for filename in os.listdir(histo_folder):
        if filename.endswith(".json"):
            print("Open " + str(filename))
            with open(os.path.join(histo_folder, filename)) as f:
                try:
                    data = json.load(f)
                except Exception as e:
                    print(" Error reading " + filename + " : " + str(e))
                    continue

            if "project_id" in data:
                map_proj_id_date_backup[data["project_id"]] = data
                continue

            if type(data) == list:
                if len(data) == 1:
                    data = data[0]
                else:
                    print("Warning: size of data in " + filename + " : " + str(len(data)) + ", avoiding this file ")
                    continue  # bug fix: the message says the file is skipped, so actually skip it
            if "id" not in data:  # to manage the case of the intricate export
                if "datou" not in data or len(data["datou"]) == 0:
                    print("Unexpected data in " + filename + " : " + str(data) + ", avoiding this file ")
                    continue

                data = data["datou"][0]

            if mtd_id is not None:
                if "id" in data and int(data["id"]) != int(mtd_id):
                    continue
            if "id" in data and data["id"] not in map_datou_id_date_backup:
                map_datou_id_date_backup[data["id"]] = {}

            print(" filename : " + str(filename))

            import re
            # example filename for parsing the date: datou_anon_42_0213_22.json (and check the datou id)
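            # Hypothetical examples of what the regex below extracts (assumed naming
            # convention, for illustration):
            #   datou_anon_42_0213_22.json            -> id 42, no suffix, month 02, day 13, hour 22, default year
            #   datou_anon_42_backup_20240213_22.json -> id 42, suffix "backup_", year 2024, month 02, day 13, hour 22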

            suffix = ""
            try:
                date_parsed = datetime.datetime.now()
                m = re.search(r"_(\d+)_([\s\w\W-]*_)?(\d{4})?(\d{2})(\d{2})_(\d{2})", filename)

                if m:
                    # note: m.groups() always has 6 entries for this pattern, so only the 6-group branch below is live
                    if len(m.groups()) == 5 and m.group(2) is not None and m.group(2) != "":
                        print(" Parsed 5 groups ! " + str(m.groups()))  # to be tested with the thing at line 92
                        date_parsed = datetime.datetime(int(m.group(2)), int(m.group(3)), int(m.group(4)),
                                                        int(m.group(5)))
                    elif len(m.groups()) == 5:
                        print(" Parsed 4 real groups ! " + str(m.groups()))
                        default_year = 2024
                        month = int(m.group(3))
                        day = int(m.group(4))
                        hour = int(m.group(5))

                        date_parsed = datetime.datetime(default_year, month, day, hour)
                    elif len(m.groups()) == 6:
                        print(" Parsed 6 groups ! " + str(m.groups()))
                        # bug fix: the original condition (m.group(3) != "") made the default year always win
                        default_year = 2024 if (m.group(3) is None or m.group(3) == "") else int(m.group(3))
                        month = int(m.group(4))
                        day = int(m.group(5))
                        hour = int(m.group(6))
                        suffix = m.group(2)
                        if suffix is None:
                            suffix = ""
                        mtd_id_parsed = int(m.group(1))

                        date_parsed = datetime.datetime(default_year, month, day, hour)
                    elif len(m.groups()) == 4:
                        print(" Fallback with 4 groups not handled, unclear what it would mean: " + str(m.groups()) + " filename : " + filename)
                        continue
                    else:
                        print("Could not parse date from filename : " + filename)
                        continue
                else:
                    print("Could not parse date from filename : " + filename)
            except Exception as e:
                print(str(e))
                print("parsing the backup name didn't work, we may have a wrong convention between export and voila, continue : " + filename)
                continue

            from lib.lib_util import humanize_modified_time
            date_parsed_h = humanize_modified_time(date_parsed.replace(tzinfo=None))
            data["saved_at"] = date_parsed
            data["saved_at_h"] = date_parsed_h + " : " + str(suffix).rstrip("_")  # filename # date_parsed.strftime("%y%m%d_%H")
            # del data["data_str"]
            map_datou_id_date_backup[data["id"]][date_parsed] = data

    for mtd_id in map_datou_id_date_backup:
        map_datou_id_date_backup[mtd_id] = dict(sorted(map_datou_id_date_backup[mtd_id].items(), reverse=True))

    map_datou_id_date_h_backup = {}
    for mtr_id in map_datou_id_date_backup:
        map_datou_id_date_h_backup[mtr_id] = {}
        for date in map_datou_id_date_backup[mtr_id]:
            data = map_datou_id_date_backup[mtr_id][date]
            map_datou_id_date_h_backup[mtr_id][data["saved_at_h"]] = data

    return map_datou_id_date_h_backup, map_proj_id_date_backup


# --job=saxia.stat_quali --limit=200 -v --project_id=134
# --job=saxia.stat_quali --limit=200 -v --project_id=122
def study_qualite_2024(lpgss=None,
                       type_doc="document_type",
                       verbose=False,
                       condition_query="TODO",
                       limit=100,
                       project_id=None):

    if lpgss is None:
        print("Missing DB connector")
        return None

    from lib.lib_util import count_and_display_elapsed_time
    import time
    begin_time = time.time()

    try:
        auto_res = lpgss.load_auto_val(type_doc, verbose, limit, project_id=project_id)
    except Exception as e:
        print("Error loading auto_val : " + str(e))
        import pandas as pd
        auto_res = pd.DataFrame(columns=["hit", "id_row", "auto_val", "id_file", "id", "hit_id_row"])

    auto_res.to_csv("auto_res.csv", sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_auto_val " + type_doc)

    manual_res = lpgss.load_manual_correct_val(type_doc, verbose, limit, project_id=project_id)
    manual_res.to_csv("manual_res.csv", sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_manual_correct_val " + type_doc)

    all_df_cons = lpgss.load_df_cons(verbose, limit, project_id=project_id)
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_df_cons ")
    big_df = None
    import pandas as pd
    for data in all_df_cons:
        one_df = pd.DataFrame(data["df_cons"])
        one_df["id_doc"] = 0
        one_df["id_mde"] = 0
        id_file = data["id_file"]
        id_mde = data["id"] if "id" in data else None
        one_df["id_file"] = id_file
        for idx, row in one_df.iterrows():
            if row["id"] == "NaN" or row["id"] == "" or not str(row["id"]).isdigit():
                print("Warning : id is NaN or empty in one_df, skipping this row")
                continue
            one_df.at[idx, "id"] = 1000 * id_mde + int(row["id"])
            one_df.at[idx, "id_mde"] = id_mde
            one_df.at[idx, "id_doc"] = int(row["id"])
            str_id = id_file + "_l_" + str(row["id"])
            # one_df.at[idx, "id"] = str_id

        if big_df is None:
            big_df = one_df
        else:
            big_df = pd.concat([big_df, one_df], ignore_index=True)

    big_df.to_csv('big_df.csv', sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "build big_df ")

    # build key
    key = "hit_id_row"
    for idx in auto_res.index:
        data = auto_res.loc[idx]
        auto_res.loc[idx, key] = str(data["hit"]) + "_" + str(data["id_row"])
    for idx in manual_res.index:
        data = manual_res.loc[idx]
        manual_res.loc[idx, key] = str(data["hit"]) + "_" + str(data["id_row"])
    # set manual_val in auto_res

    map_count_key_value = {}
    total = 0

    for idx in auto_res.index:
        data = auto_res.loc[idx]
        key_val = data["hit_id_row"]

        total += 1
        try:
            # if True:  # key in manual_res.index:
            idxs = manual_res.loc[manual_res['hit_id_row'] == key_val].index
            if len(idxs) == 1:
                auto_res.loc[idx, "manual_val"] = manual_res.loc[idxs[0]]["manual_val"]
                val = manual_res.loc[idxs[0]]["manual_val"]
            else:
                if len(idxs) > 1:
                    print(" Not treated: unexpected multiple values")
                auto_res.loc[idx, "manual_val"] = data["auto_val"]
                val = data["auto_val"]

            if val not in map_count_key_value:
                map_count_key_value[val] = 0
            map_count_key_value[val] += 1
        except Exception as e:
            print(str(e))
            auto_res.loc[idx, "manual_val"] = data["auto_val"]
            # print("missing key " + key + " in auto_res")
    # set manual_val to auto_val when missing

    map_count_key_value_ordered = dict(sorted(map_count_key_value.items(), key=lambda item: item[1], reverse=True))

    print(" total : " + str(total))
    print(" map_count_key_value_ordered : " + str(map_count_key_value_ordered))
    count_quantile_90 = 0
    count_nb_to_keep = 0
    for key in map_count_key_value_ordered:
        count_quantile_90 += map_count_key_value_ordered[key]
        count_nb_to_keep += 1
        if count_quantile_90 > 0.9 * total:
            break

    print(" Kept count_nb_to_keep : " + str(count_nb_to_keep) + " over " + str(len(map_count_key_value_ordered)))

    if type_doc == "document_type":
        list_key_value_doc_type = ["cr_operation", "cr_exam", "cr_hospit", "cr_urgence", "courrier", "facture_utile", "facture", "ordonnance", "certif_blessure", "certif_at", "attestation", "certif_medical", "certif_hospitalisation", "autre", "ordonnance_medicament_exception", "cr_pompier", "facture_inutile"]
    else:
        list_key_value_doc_type = list(map_count_key_value_ordered.keys())
        # [:count_nb_to_keep]
    print(" list_key_value_doc_type : " + str(list_key_value_doc_type))
    # type_doc
    # create a dataframe with these lists as columns and rows
    import pandas as pd
    # df = pd.DataFrame(columns=list_key_value_doc_type)

    # Now compute the confusion matrix
    associated_labels = {}
    predicted_labels = {}
    list_associated_labels = list_key_value_doc_type + ["Unknown Label"]
    list_predicted_labels = list_key_value_doc_type + ["Unknown Label"]
    list_data_on_sen_fout = []
    map_list_error = {}
    idx_voila = 0
    for idx in auto_res.index:
        data = auto_res.loc[idx]
        manual_val = data["manual_val"]
        auto_val = data["auto_val"]
        idx_voila += 1  # data["hit_id_row"]
        associated_labels[idx_voila] = manual_val
        predicted_labels[idx_voila] = auto_val
        if auto_val != manual_val:
            key_diff = str(auto_val) + "P" + str(manual_val)
            if key_diff not in map_list_error:
                map_list_error[key_diff] = []

            map_list_error[key_diff].append((data["hit"], data["id_row"]))

    if type_doc == "document_type":

        from pyfvs.lib.advanced.lib_confusion_matrix import compute_confusion_matrix
        mat_conf = compute_confusion_matrix(associated_labels, predicted_labels,
                                            list_associated_labels, list_predicted_labels, list_data_on_sen_fout=[])
        print("mat_conf : " + str(mat_conf))
        print(mat_conf)
        with open("mat_conf_" + type_doc + ".html", "w") as f:
            f.write(str(mat_conf.to_html()))
        print(" mat_conf.txt written")

        print("<br><br> Error type_doc : " + type_doc + "<br><br>")
        for d in list_associated_labels:
            for p in list_predicted_labels:
                if mat_conf.at[p, d] > 0:
                    key_list_diff = str(p) + "P" + str(d)
                    if key_list_diff in map_list_error:
                        print("<br>mat_conf.at[" + d + "," + p + "] : " + str(mat_conf.at[p, d]))
                        for data in map_list_error[key_list_diff]:
                            print("<br><a href='https://safia.app/manax?hash_id_treatment=" + str(data[0]) + "&id_line=" + str(data[1]) + "'> LINE " + str(data[1]) + " " + str(p) + " to " + str(d) + " </a>")
                        # print("mat_conf.at[" + d + "," + p + "] : " + str(mat_conf.at[p, d]))
                        # for data in auto_res:
                        #     if data["auto_val"] == p and data["manual_val"] == d:
                        #         print("data : " + str(data))
    else:
        print("<br>\n".join(list(map_list_error.keys())))

    print("TO CHECK WIP 15-8-24")


# --job=saxia.stat_quali --limit=200 -v --project_id=94 --in_file=condition_intro_doc,only_correct_prediag,condition_only_one_page
def study_qualite(lpgss=None,
                  type_doc="document_type",
                  verbose=False,
                  condition_query="TODO",
                  limit=100,
                  project_id=None,
                  condition_intro_doc=True,
                  only_correct_prediag=True,
                  condition_only_one_page=True,
                  list_to_study="document_type,medecin_nom,medecin_specialite,datet",
                  with_out_folder=False,
                  prepare_data_set=False):

    if lpgss is None:
        print("Missing DB connector")
        return None

    list_dataset = []

    map_list_input_by_document = {}
    list_complete = []
    if condition_intro_doc == True:
        if project_id is None:
            load_conf_from_project_id = 91
        else:
            load_conf_from_project_id = project_id

        from lib.lib_safia_system import LibSafiaSystem
        lss = LibSafiaSystem(lib_user_data_internal=lpgss)
        raw_configuration = lss.load_conf_project(load_conf_from_project_id)
        from lib.util.lib_formal_conf import formal_conf_prepare
        configuration = formal_conf_prepare(raw_configuration, lss)
        intro_format_intro = load_sub_json(configuration, "saxia/format/info_format_intro/format/intro")

        from lib.lib_util import compute_list_input_to_format_per_document
        map_list_input_by_document = compute_list_input_to_format_per_document(intro_format_intro)
        # We also need the complete list of document types, so we concatenate the lists
        # that are the values of map_list_input_by_document
        list_complete = []
        for key in map_list_input_by_document:
            list_complete += map_list_input_by_document[key]
        list_complete = list(set(list_complete))

    from lib.lib_util import count_and_display_elapsed_time
    import time
    begin_time = time.time()

    all_df_cons = lpgss.load_df_cons(limit=limit, project_id=project_id, verbose=verbose, with_out_folder=False)  # with_out_folder)
    list_mhit = []
    map_mhit_outfolder = {}
    for data in all_df_cons:
        if "mhit" in data and data["mhit"] is not None:
            list_mhit.append(data["mhit"])
    if with_out_folder:
        list_mhit_out_folder = lpgss.load_output_folder_from_mhit(list_mhit, verbose=verbose)
        for r in list_mhit_out_folder:
            if "mhit" in r and "out_folder" in r:
                if r["out_folder"] is not None and r["mhit"] is not None:
                    map_mhit_outfolder[r["mhit"]] = r["out_folder"] + "/" + r["mhit"]
                else:
                    print("Warning : missing out_folder for mhit " + str(r))
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_df_cons ")
    big_df = None
    import pandas as pd
    map_id_file_count_nan = {}

    for data in all_df_cons:
        one_df = pd.DataFrame(data["df_cons"])
        one_df["id_doc"] = 0
        one_df["id_mde"] = 0
        one_df["image_path"] = ""
        id_file = data["id_file"]
        out_folder = data["out_folder"] if "out_folder" in data else ""
        mhit = data["mhit"] if "mhit" in data else None
        if mhit is not None and mhit in map_mhit_outfolder:
            out_folder = map_mhit_outfolder[mhit]
        id_mde = data["id"] if "id" in data else None
        one_df["id_file"] = id_file
        map_id_file_count_nan[id_file] = 0
        for idx, row in one_df.iterrows():
            if row["id"] == "NaN" or row["id"] == "" or not str(row["id"]).isdigit():
                if row["Liste des pages"] != "":
                    map_id_file_count_nan[id_file] += 1
                    # print("Warning : id is NaN or empty in one_df, skipping this row : " + str(row))
                continue
            one_df.at[idx, "id"] = 1000 * id_mde + int(row["id"])
            one_df.at[idx, "id_mde"] = id_mde
            one_df.at[idx, "id_doc"] = int(row["id"])
            one_df.at[idx, "image_path"] = out_folder + "/page_" + str(row["Liste des pages"]) + ".png" if out_folder != "" else ""
            str_id = str(id_file) + "_l_" + str(row["id"])
            # one_df.at[idx, "id"] = str_id

        if big_df is None:
            big_df = one_df
        else:
            big_df = pd.concat([big_df, one_df], ignore_index=True)

        print("map_id_file_count_nan[id_file] " + str(map_id_file_count_nan[id_file]) + " rows with problems and non-null Liste des Pages")

    print(len(big_df), " rows in big_df")

    # Keep only the rows where "document_type" and "Liste des pages" are non-empty
    if "document_type" in big_df.columns and "Liste des pages" in big_df.columns:
        big_df = big_df[big_df["document_type"].notna() & big_df["Liste des pages"].notna()]
        print(len(big_df), " rows in big_df")
        big_df = big_df[big_df["document_type"] != '']
        print(len(big_df), " rows in big_df")
        big_df = big_df[big_df["Liste des pages"] != '']

    print(len(big_df), " rows in big_df")

    big_df["nb_word_cr"] = -1
    big_df["nb_word_quantile"] = -1
    map_list_quantile = {5: "0_5", 20: "5_20", 50: "20_50", 150: "50_150", 1000: "150_1000", 10000: "1000_infini"}
    for idx in big_df.index:
        data = big_df.loc[idx]
        if "cr_correct_typo" in data and not pd.isna(data["cr_correct_typo"]):
            nb_word_cr = len(str(data["cr_correct"]).split())
            big_df.at[idx, "nb_word_cr"] = nb_word_cr
            for q in map_list_quantile:
                if nb_word_cr < q:
                    break
            big_df.at[idx, "nb_word_quantile"] = map_list_quantile[q]
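
    # Bucketing sketch: map_list_quantile's keys are ascending upper bounds, and the
    # loop above leaves q at the first bound strictly greater than nb_word_cr. For
    # example nb_word_cr = 42 breaks at q = 50, so the row is tagged "20_50";
    # anything with 1000 words or more ends up in the last bucket "1000_infini".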

    big_df.to_csv('big_df.csv', sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "build big_df ")

    print("big_df : " + str(big_df.head(10)))
    big_df = big_df.reset_index(drop=True)
    # big_df["ERROR"] = 'COL'

    # Initialize a dataframe with the same columns as big_df
    df_to_study = pd.DataFrame(columns=big_df.columns)
    # Appending the first row
    # df_to_study = pd.concat([df_to_study, big_df.iloc[0:0]], ignore_index=True)

    list_one_dim_distribution = ["document_type", "medecin_specialite",
                                 "document_type_auto", "medecin_specialite_auto",
                                 "Nombre de pages", "prediag", "Liste des pages", "nb_word_quantile"]
    for c in list_one_dim_distribution:
        if c not in big_df.columns:
            print("Column " + c + " not in big_df, skipping")
            continue
        print("Processing column STUDY ONE DIM " + c)
        map_val_nb = {}
        count_total = 0
        count_empty = 0
        count_not_empty = 0

        for idx in big_df.index:
            data = big_df.loc[idx]
            count_total += 1
            val = data[c] if c in big_df.columns else None
            if type(val) == list:
                # check for lists first: pd.isna on a list is ambiguous
                print("ERROR: value is a list, converting to string for counting: " + str(val))
                val = str(val)
            if pd.isna(val) or val == "":
                count_empty += 1
            else:
                count_not_empty += 1
            if val not in map_val_nb:
                map_val_nb[val] = 0
            map_val_nb[val] += 1
        # Display the distribution
        print("Distribution for column " + c + ":")
        # Order by count, decreasing
        map_val_nb = dict(sorted(map_val_nb.items(), key=lambda item: item[1], reverse=True))
        for val in map_val_nb:
            if count_total == 0:
                print(f"{val}: {map_val_nb[val]} (0.00%)")
            else:
                print(f"{val}: {map_val_nb[val]} ({round(map_val_nb[val] / count_total * 100, 2)}%)")

        # Display the results
        print("Column " + c + " :")
        print("Total rows: " + str(count_total))
        if count_total == 0:
            print("Empty auto: 0 (0.00%)")
            print("Not empty auto: 0 (0.00%)")
        else:
            print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
            print("Not empty auto: " + str(count_not_empty) + " (" + str(round(count_not_empty / count_total * 100, 2)) + "%)")
        print("")

    # Now build a table over different pairs of columns
    list_paires = [("document_type", "prediag"), ("document_type", "nb_word_quantile"), ("prediag", "nb_word_quantile")]
    for (col1, col2) in list_paires:
        if col1 not in big_df.columns or col2 not in big_df.columns:
            print("Column " + col1 + " or " + col2 + " not in big_df, skipping")
            continue
        print("Processing column pair STUDY TWO DIM " + col1 + " and " + col2)
        map_val_nb = {}
        map_val1_nb = {}
        count_total = 0
        count_empty = 0
        count_not_empty = 0

        for idx in big_df.index:
            data = big_df.loc[idx]
            count_total += 1
            val1 = data[col1]
            val2 = data[col2]
            if pd.isna(val1) or val1 == "" or pd.isna(val2) or val2 == "":
                count_empty += 1
            else:
                count_not_empty += 1
            if str(val1) not in map_val_nb:
                map_val_nb[str(val1)] = {}
                map_val1_nb[str(val1)] = 0
            if str(val2) not in map_val_nb[str(val1)]:
                map_val_nb[str(val1)][str(val2)] = 0
            key = str(val1) + "_" + str(val2)

            map_val_nb[str(val1)][str(val2)] += 1
            map_val1_nb[str(val1)] += 1

        # Display the distribution
        print("Distribution for columns " + col1 + " and " + col2 + ":")
        # Order by count, decreasing
        # map_val_nb = dict(sorted(map_val_nb.items(), key=lambda item: item[1], reverse=True))
        for k1 in map_val_nb:
            print(f"{k1}: {map_val1_nb[k1]} ({round(map_val1_nb[k1] / count_total * 100, 2)}%) ")
            for k2 in map_val_nb[k1]:
                import sys
                sys.stdout.write(f"{k2} : {map_val_nb[k1][k2]} ({round(map_val_nb[k1][k2] / map_val1_nb[k1] * 100, 2)}%) ")
            print("")

        # Display the results
        print("Columns " + col1 + " and " + col2 + ":")
        print("Total rows: " + str(count_total))
        print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
        print("Not empty auto: " + str(count_not_empty) + " (" + str(round(count_not_empty / count_total * 100, 2)) + "%)")
        print("")

    list_comparison = []

    # Compute the error and correct rates for each column
    # , 'Nombre de pages'
    list_comparison.append(("intro_correct_typo", "intro_back"))
    for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 'Titre',
                    'medecin_specialite', 'nom_hopital', 'genre_service_hopital',
                    'indication_examen', 'date_entree_hospitalisation',
                    'date_sortie_hospitalisation', 'motif_hospitalisation',
                    'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced',
                    'date_fin_arret_travailt', 'date_entree_hospitalisationt', 'date_sortie_hospitalisationt']:
        col_auto = col_ref + "_auto"
        col_manual = col_ref  # + "_manual"
        list_comparison.append((col_manual, col_auto))

    list_comparison.append(("cr_correct_typo", "cr_back"))

    list_error_for_meta_data = []
    for (col_manual, col_auto) in list_comparison:
        if col_auto not in big_df.columns or col_manual not in big_df.columns:
            print("Column " + col_auto + " or " + col_manual + " not in big_df, skipping")
            continue
        list_errors = []
        list_commentaires_corrects = []
        list_commentaires_errors = []
        print("Processing column " + col_manual)
        count_ignore = 0
        count_total = 0
        count_correct = 0
        count_error = 0
        count_empty = 0
        count_not_empty = 0
        count_not_empty_auto = 0
        count_not_empty_manual = 0
        for idx in big_df.index:
            doc_type = big_df.loc[idx, "document_type"]
            comms = big_df.loc[idx, "Commentaires"]
            if doc_type in map_list_input_by_document:
                list_input = map_list_input_by_document[doc_type]
                if col_manual not in list_input and col_manual in list_complete and col_manual != 'document_type':
                    # print("Ignoring column " + col_manual + " for document type " + doc_type)
                    count_ignore += 1
                    # list_error_for_meta_data.append(idx)
                    continue
            else:
                if condition_intro_doc:
                    print("MISSING DATA TO ANALYZE : " + str(doc_type))
                pass
            if condition_only_one_page:
                # if "Nombre de page" in big_df.columns :
                if "Liste des pages_auto" in big_df.columns and "Liste des pages" in big_df.columns:
                    if str(big_df.loc[idx, "Liste des pages_auto"]).strip().isdigit() and str(big_df.loc[idx, "Liste des pages"]).strip().isdigit() and int(big_df.loc[idx, "Liste des pages"]) == int(big_df.loc[idx, "Liste des pages_auto"]):
                        pass
                    else:
                        if "," not in str(big_df.loc[idx, "Liste des pages_auto"]) or "," not in str(big_df.loc[idx, "Liste des pages"]):
                            if "none" != str(big_df.loc[idx, "Liste des pages_auto"]).lower():
                                print("UNEXPECTED LISTE DES PAGES " + str(big_df.loc[idx, "Liste des pages_auto"]) + "||" + str(big_df.loc[idx, "Liste des pages"]))
                        else:
                            import sys
                            sys.stdout.write("ñø")
                            # several pages
                        count_ignore += 1
                        list_error_for_meta_data.append(idx)
                        continue
                else:
                    print("UNEXPECTED missing Nombre de page on document_type : " + str(doc_type) + " ")
                    # print("UNEXPECTED missing info intro on document_type : " + str(doc_type) + " ")
            if only_correct_prediag:
                if big_df.loc[idx, "prediag"] not in ("OK", "BON"):
                    if big_df.loc[idx, "prediag"] not in ("", "AUTRE", "CERFA_MANUSCRIT", "MANUSCRIT", "TABLEAU", "MAUVAIS", "PRESQUEBON"):
                        print("UNEXPECTED " + big_df.loc[idx, "prediag"])
                    count_ignore += 1
                    list_error_for_meta_data.append(idx)
                    continue
            else:
                if big_df.loc[idx, "prediag"] == "":
                    count_ignore += 1
                    list_error_for_meta_data.append(idx)
                    continue
            data = big_df.loc[idx]
            count_total += 1
            auto_val = data[col_auto]
            manual_val = data[col_manual]
            if pd.isna(auto_val) or auto_val == "":
                count_empty += 1
            else:
                count_not_empty += 1
                count_not_empty_auto += 1
            if type(manual_val) == list or pd.isna(manual_val) or manual_val == "":
                continue
            count_not_empty_manual += 1
            id_file = data["id_file"]
            lp_id = data["Liste des pages"]
            mde_id = int(data["id"] / 1000) if data["id"] is not None else ""
            if str(auto_val).lower() == str(manual_val).lower():
                count_correct += 1
                list_commentaires_corrects.append(comms)
            else:
                count_error += 1
                list_error_for_meta_data.append(idx)
                if idx in [13, 19, 25, 28, 29, 30, 31, 32, 34, 38, 41, 42, 43, 55]:
                    print("Unexpected wrong meta data but perfect cr_back !")
                comms_start = comms[:32] if type(comms) == str else ""
                list_errors.append((mde_id, lp_id, manual_val, auto_val, comms_start))
                list_commentaires_errors.append(comms)
                if condition_intro_doc and doc_type in map_list_input_by_document and col_manual in map_list_input_by_document[doc_type]:
                    if idx not in list_error_for_meta_data:
                        list_error_for_meta_data.append(idx)
                    intro_correct = big_df.loc[idx, "intro_correct_typo"]
                    if not col_manual.startswith("date") and manual_val.lower() not in intro_correct.lower():
                        print("Error in " + col_manual + " for " + doc_type + " : " + str(manual_val) + " not in " + str(intro_correct) + " for idx : " + str(idx))
                        print("TO ANALYZE")
                if col_manual in list_to_study:
                    try:
                        # big_df.iloc[idx, "ERROR"] = col_manual
                        # Add the row to the df_to_study dataframe
                        big_df.loc[idx, "Commentaires"] = "DIFFERENCE in " + col_manual + " : " + comms
                        one_more_line = big_df.iloc[idx:idx + 1]
                        df_to_study = pd.concat([df_to_study, one_more_line], ignore_index=True)
                        print("Padam")
                    except Exception as e:
                        print("Error adding row to df_to_study for idx " + str(idx) + ": " + str(e))
                        continue

        # Display the results
        print("Column " + col_manual + " :")
        print("Ignored rows: " + str(count_ignore))
        print("Total rows: " + str(count_total))
        if count_total == 0:
            print("Empty auto: 0 (0.00%)")
            print("Not empty auto: 0 (0.00%)")
        else:
            print("Correct: " + str(count_correct) + " (" + str(round(count_correct / count_total * 100, 2)) + "%)")
            print("Errors: " + str(count_error) + " (" + str(round(count_error / count_total * 100, 2)) + "%)")
            print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
            print("Not empty auto: " + str(count_not_empty_auto) + " (" + str(round(count_not_empty_auto / count_total * 100, 2)) + "%)")
            print("Not empty manual: " + str(count_not_empty_manual) + " (" + str(round(count_not_empty_manual / count_total * 100, 2)) + "%)")
        print("Lists of error pairs")
        print(str(list_errors))
        print("Lists of comments in the error cases")
        print(str(list_commentaires_errors))
        print("Lists of comments in the correct cases")
        print(str(list_commentaires_corrects))
        print("")

    list_complete = list(big_df.index)  # list(range(0, len(big_df)))
    list_total_correct = []  # list(big_df.index)
    for idx in list_complete:
        if idx not in list_error_for_meta_data:
            list_total_correct.append(idx)

    print("Total correct : " + str(len(list_total_correct)))
    print(str(list_total_correct))

    if condition_intro_doc:
        list_all_correct = list(big_df.index[big_df["intro_correct_typo"] == big_df["intro_back"]]) + list_total_correct
        for idx in list_all_correct:
            print("idx : " + str(idx) + " intro_correct_typo : " + str(big_df.loc[idx, "id"]))

        print(big_df.index[big_df["intro_correct_typo"] == big_df["intro_back"]])
        print(list_total_correct)
        for idx in list_all_correct:
            try:
                data = big_df.loc[idx]
                doc_type = data["document_type"]
                list_needed_column = map_list_input_by_document.get(doc_type, [])
                input_as_json = {}
                for f in list_needed_column:
                    if f in data and not pd.isna(data[f]) and data[f] != "":
                        if f.startswith("date"):
                            import datetime
                            data_used = datetime.datetime.strptime(data[f][:10], "%Y-%m-%d")
                        else:
                            data_used = data[f]
                        input_as_json[f] = data_used
                    else:
                        input_as_json[f] = None
                from lib.lib_util import format_one_res
                if doc_type in intro_format_intro:
                    intro_from_manual_saved = format_one_res(input_as_json, intro_format_intro[doc_type], format_premier=False, format_date="%d %B %Y")
                else:
                    print("Missing intro_format_intro for doc_type : " + str(doc_type))
                    intro_from_manual_saved = ""
                print(" idx : " + str(idx))
                print(" id : " + str(data["id"]))
                print(" intro_from_manual_saved :" + str(intro_from_manual_saved))
                print(" intro_correct_typo : " + str(data["intro_correct_typo"]))
                print(" intro_back : " + str(data["intro_back"]))
            except Exception as e:
                print("Error processing idx " + str(idx) + ": " + str(e))
                continue

    if prepare_data_set:
        for idx, data in big_df.iterrows():
            doc_type = data["document_type"]
            if doc_type in map_list_input_by_document:
                list_needed_column = map_list_input_by_document[doc_type]
            else:
                print("Missing doc_type in map_list_input_by_document : " + str(doc_type))
                continue
            one_data_set = {}
            if "image_path" not in data:
                print("Missing image_path, did you use with_out_folder ? " + str(data) + " for doc_type : " + str(doc_type) + ", skipping this row.")
                continue
            # image_path
            one_data_set["image_path"] = data["image_path"] if "image_path" in data else ""
            json_extract = {}
            for c in list_needed_column:
                if c in data and not pd.isna(data[c]) and data[c] != "":
                    if c.startswith("date"):
                        try:
                            import datetime
                            json_extract[c] = datetime.datetime.strptime(data[c][:10], "%Y-%m-%d")
                        except Exception as e:
                            print("Error parsing date for " + c + ": " + str(e))
                            json_extract[c] = "None"
                    else:
                        json_extract[c] = data[c]
                else:
                    json_extract[c] = None
                    print("Missing data !")
                    continue
            one_data_set["extract_meta_data"] = json_extract
            list_dataset.append(one_data_set)

        import datetime
        jour_suffix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        list_dataset_out_file = "dataset_" + jour_suffix + ".json"
        with open(list_dataset_out_file, "w") as f:
            import json
            # Convert list_dataset to JSON format
            # Ensure that datetime objects are converted to strings
            # f.write(str(list_dataset))
            f.write(json.dumps(list_dataset, default=str, indent=4))

        df_data = pd.DataFrame(list_dataset, columns=["image_path", "extract_meta_data"])
        list_dataset_out_file = "dataset_as_df_" + jour_suffix + ".csv"
        df_data.to_csv(list_dataset_out_file, sep='\t', index=False)

        from lib.lib_ml.lib_util_prepare_dataset import from_csv_create_json_dataset
        from_csv_create_json_dataset(list_dataset_out_file, crop=False,
                                     server_root="https://safia.app",
                                     folder_root="",
                                     download_or_get_local_file=False,  # used only for crop !
                                     col_url_path="image_path",
                                     col_text="extract_meta_data",
                                     sep='\t')

    df_to_study = df_to_study.reset_index(drop=True)
    return df_to_study