Coverage for lib/manaudit/lib_datou_audit.py: 4%

1042 statements  

coverage.py v7.9.1, created at 2026-02-10 01:10 +0100

import datetime
import types

import pandas.core.dtypes.generic


# We could extract the datou from the audit_info: it sits right at the beginning. Ah ha ah!
# But we have no function to build the datou from the json.
# We could also extract the man_audit_info from the datou rather than from the audit, oh well!
# We necessarily have to look in the datou, or at least in the dict that represents it, for the useful json param.

def parse_audit_info(audit_info, list_action_for_df_to_correct = [], map_modif_df_manual = {}):
    nb_manual_action_df_for_col_audit = 0
    try:
        # In fact it is enough to find load_tab
        if "io_exec" not in audit_info:
            return {}, {}

        panda_table_content_as_markdown = ""
        panda_table_content_as_json = {}
        data_to_parse_for_json = ""
        suffix = ""

        map_col_id_to_id_df = {}

        for steps in audit_info["io_exec"]:
            if not steps.isdigit():
                continue
            type_step = audit_info["io_exec"][steps]["datou_step"]
            param_json = audit_info["io_exec"][steps]["param_json"]
            input = audit_info["io_exec"][steps]["input"]
            output = audit_info["io_exec"][steps]["output"]
            if type_step == "load_tab":
                if "assoc" in param_json:
                    print("Will maybe fail, we need to find the right assoc")
                if panda_table_content_as_markdown != "":
                    print("Will maybe fail, since we have two load_tab steps")
                panda_table_content_as_markdown = input["result"] if "result" in input else ""
            if type_step == "image_to_text":
                suffix = "google_ocr"
                # param_json =
            if type_step == "map_reduce":
                print("We want the second map_reduce, like id_step 7, or maybe any of them, the result field of input ?? ")
                if "text" in output:
                    data_to_parse_for_json = output["text"]
            if type_step == "format":
                if "df_complet_as_markdown" in output:
                    panda_table_content_as_markdown = output["df_complet_as_markdown"]
                if "df_complet_as_json" in output:
                    panda_table_content_as_json = output["df_complet_as_json"]
                pass

        from lib.batch.lib_batch import create_pandas_table_from_text

        df = create_pandas_table_from_text(panda_table_content_as_markdown)
        import pandas as pd
        if panda_table_content_as_json != {} and type(panda_table_content_as_json) != types.NoneType and panda_table_content_as_json != "":
            df = pd.read_json(panda_table_content_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])
        if type(df) == type(None):
            print("pd none from : " + str(panda_table_content_as_markdown))
            df = pd.DataFrame(columns=["Null Column"])
        else:
            df["intro_correct"] = "Le ..."
            df["cr_correct"] = "Le ..."
            df["cr_correct_typo"] = ""
            df["intro_correct_typo"] = ""
            df["cr_front_init"] = "init"
            df["intro_front_init"] = "init"
            df["auto_state"] = """<input type="checkbox" value="" name="bordered-checkbox" class="w-4 h-4 text-blue-600 bg-gray-100 border-gray-300 rounded focus:ring-blue-500 dark:focus:ring-blue-600 dark:ring-offset-gray-800 focus:ring-2 dark:bg-gray-700 dark:border-gray-600">Automation impossible"""
            df["auto_state_val"] = "false"
            df["class_vert_rouge_valider"] = "false"

        from lib.lib_util import parse_json_from_prompt_result, \
            complete_date_and_order_json_to_mettre_en_forme, append_id_by_order
        list_json_to_mettre_en_forme = parse_json_from_prompt_result(data_to_parse_for_json)
        list_json_to_mettre_en_forme = append_id_by_order(list_json_to_mettre_en_forme)

        # if order_by_date:
        #     list_json_to_mettre_en_forme = complete_date_and_order_json_to_mettre_en_forme(list_json_to_mettre_en_forme)

        from lib.lib_util import add_parsing_meta_info_to_table
        # VR 29-3-24: in my opinion this has already been done
        # df = add_parsing_meta_info_to_table(df, list_json_to_mettre_en_forme)

        if len(list_action_for_df_to_correct) > 0:
            for action in list_action_for_df_to_correct:
                type_action = action["type_action"] if "type_action" in action else "default"
                if type_action == "df_meta_info_correct":
                    key_action = create_key_action(action)
                    count = (map_modif_df_manual[key_action]["count"] + 1) if key_action in map_modif_df_manual and "count" in map_modif_df_manual[key_action] else 1
                    map_modif_df_manual[key_action] = action
                    map_modif_df_manual[key_action]["count"] = count
                    map_modif_df_manual[key_action]["audit_or_datou"] = "audit"

                    id_line = action["id"]
                    if id_line == 'NaN':
                        continue
                    manual_value = action["manual_value"]
                    col_name = action["col_name"]
                    # refactor: rename front_value and add a ...
                    manual = bool(action["manual"]) if "manual" in action else False
                    if manual:
                        nb_manual_action_df_for_col_audit += 1
                    # df.loc[int(id_line)][col_name] = manual_value
                    try:
                        # managing incomplete parsing of the document => not clear why it creates this, but there is some link to the audit bug
                        id_line_int = int(str(id_line).replace(".0", ""))
                    except Exception as e:
                        print(" Pb with action id_line : " + str(id_line) + " " + str(e) + " parsing as float then int !")
                        if id_line != 'None':
                            print("ERROR treated as warning need to be audited")
                            id_line_int = int(float(id_line))
                        else:
                            continue

                    if map_col_id_to_id_df == {}:
                        for i in range(len(df)):
                            idx_df = df.index[i]
                            idx_from_data = df.loc[idx_df]["id"]
                            if str(idx_from_data).replace(".0", "").isdigit():
                                map_col_id_to_id_df[str(idx_from_data).replace(".0", "")] = idx_df
                            else:
                                print("ERROR should fail, wrong data will be saved ! " + str(idx_from_data))
                                if str(idx_df).isdigit():
                                    map_col_id_to_id_df[str(idx_df)] = idx_df
                    if str(id_line).replace(".0", "") in map_col_id_to_id_df:
                        id_line_df = map_col_id_to_id_df[str(id_line).replace(".0", "")]
                        # df.iloc[id_line_df][col_name] = manual_value
                        df.loc[id_line_df, col_name] = manual_value
                        # df.loc[:,('one','second')]
                    else:
                        print(" Missing id line in df : " + str(id_line) + " in " + str(map_col_id_to_id_df.keys()))

                    # df.loc[id_line_int, col_name] = manual_value
                else:
                    print("type action not supported : " + str(type_action))

        df_to_html = df.to_html(classes="table pdt-table table-striped", escape=False)

        # TODO VR: clearly put this in the html! contrary to what chat gpt says
        return {"dataframe_html" : df_to_html,
                "dataframe_text" : panda_table_content_as_markdown,
                "suffix" : suffix,
                "df" : df,
                "paragraphs" : []}, {"nb_manual_action_df_for_col_audit" : nb_manual_action_df_for_col_audit, "map_modif_df_manual" : map_modif_df_manual}
    except Exception as e:
        print(str(e))
        return {"dataframe_html": None,
                "dataframe_text": "",
                "suffix": ""}, {"nb_manual_action_df_for_col_audit" : nb_manual_action_df_for_col_audit, "map_modif_df_manual" : map_modif_df_manual}

# dataframe_html = df.to_html()

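
# Illustrative sketch (not in the original source): the minimal "io_exec" shape
# that parse_audit_info walks over. Step keys are digit strings; each step
# carries the datou_step name plus its param_json / input / output dicts, and
# the "format" step is where df_complet_as_markdown / df_complet_as_json live.
def _sketch_io_exec_shape():
    audit_info = {
        "io_exec": {
            "0": {"datou_step": "load_tab", "param_json": {},
                  "input": {"result": "| id | datet |"}, "output": {}},
            "9": {"datou_step": "format", "param_json": {}, "input": {},
                  "output": {"df_complet_as_json": '{"id": {"0": 0}}'}},
        }
    }
    return audit_info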

def load_sub_json(main_json, key_list_slash):
    try:
        if key_list_slash == "":
            return main_json
        separator = "/"
        # if "/" not in key_list_slash and "." in key_list_slash:
        #     separator = "."
        list_keys = key_list_slash.lstrip(separator).split(separator)
        sub_json = main_json
        for k in list_keys:
            if type(sub_json) == list:
                if not k.isdigit():
                    print("Unexpected index for array in load_sub_json : " + str(k))
                    return None
                k_int = int(k)
                if k_int >= len(sub_json):
                    print("Not enough value in array in load_sub_json")
                    return None
                sub_json = sub_json[k_int]
            elif k in sub_json:  # expected type dict
                sub_json = sub_json[k]
                # if type(sub_json) not in (dict, list):
                #     return sub_json
            else:
                # Dict with key as integer, so we need to convert k to int; used in io_exec
                if k.isdigit():
                    if int(k) in sub_json:
                        sub_json = sub_json[int(k)]
                    else:
                        print(" missing key " + k + " in " + str(sub_json.keys()))
                        return None
        return sub_json
    except Exception as e:
        print("Exception in load_sub_json")
        print(str(e))
        return None

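
# Illustrative sketch (not in the original source): load_sub_json navigates a
# nested json with a slash-separated path; list levels are indexed with digit
# strings, and digit keys are retried as int keys (the io_exec case).
def _demo_load_sub_json():
    doc = {"io_exec": {"3": {"input": {"paragraphs": [["p0"], ["p1a", "p1b"]]}}}}
    assert load_sub_json(doc, "io_exec/3/input/paragraphs/1/0") == "p1a"
    assert load_sub_json(doc, "") is doc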

def set_sub_json(main_json, key_list_slash, value):
    separator = "/"
    # if "/" not in key_list_slash and "." in key_list_slash:
    #     separator = "."
    list_keys = key_list_slash.lstrip(separator).split(separator)
    if key_list_slash.lstrip(separator) == "":
        return value
    sub_json = main_json
    for k in list_keys[:-1]:
        if type(sub_json) == list:
            if not k.isdigit():
                print("Unexpected index for array in set_sub_json : " + str(k))
                return None
            k_int = int(k)
            if k_int >= len(sub_json):
                print("Not enough value in array in set_sub_json")
                return None
            sub_json = sub_json[k_int]
        elif k in sub_json:
            sub_json = sub_json[k]
        else:
            print(" missing key " + k + " in " + str(sub_json.keys()))
    sub_json[list_keys[-1]] = value
    return main_json

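
# Illustrative sketch (not in the original source): set_sub_json writes through
# the same slash-path convention, mutating main_json in place on the last key.
def _demo_set_sub_json():
    doc = {"io_exec": {"9": {"output": {"df_complet_as_json": "{}"}}}}
    set_sub_json(doc, "io_exec/9/output/df_complet_as_json", '{"id": {"0": 0}}')
    assert load_sub_json(doc, "io_exec/9/output/df_complet_as_json") == '{"id": {"0": 0}}'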

def create_key_action(action):
    if "type_action" not in action or action["type_action"] != "df_meta_info_correct" or "col_name" not in action or "id" not in action:
        return None
    return str(action["id"]) + "_" + str(action["col_name"])

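
# Illustrative sketch (not in the original source): the dedup key used by
# map_modif_df_manual is simply "<row id>_<column name>", so later manual
# corrections of the same cell overwrite earlier ones while "count" keeps track.
def _demo_create_key_action():
    action = {"type_action": "df_meta_info_correct", "id": 3, "col_name": "datet"}
    assert create_key_action(action) == "3_datet"
    assert create_key_action({"type_action": "class_paragraph"}) is None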

def is_numericf(input):
    if input == "NaN":
        return False
    try:
        float(input)
        return True
    except ValueError:
        return False
    except Exception as e:
        print("Unexpected error treated as warning")
        print(str(e))
        return False

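
# Illustrative sketch (not in the original source): is_numericf accepts anything
# float() can parse except the literal string "NaN", which the front end uses
# for rows without a usable id.
def _demo_is_numericf():
    assert is_numericf("3.0") and is_numericf("42")
    assert not is_numericf("NaN") and not is_numericf("None") and not is_numericf("")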

def count_time_lab_by_user(all_result_manual_correct, delta_min_between_save_minutes = 10):
    map_user_id_time_modif = {}
    try:
        for result in all_result_manual_correct:
            user_id = result["user_id"] if "user_id" in result else 0
            if user_id not in map_user_id_time_modif:
                map_user_id_time_modif[user_id] = []
            if "created_at" in result and type(result["created_at"]) == datetime.datetime:
                map_user_id_time_modif[user_id].append(result["created_at"])
    except Exception as e:
        print(str(e))

    map_result_time_by_user = {}

    for user_id in map_user_id_time_modif:
        map_user_id_time_modif[user_id] = sorted(list(set(map_user_id_time_modif[user_id])))
        count_minutes = 0
        list_intervals = []
        previous_time = None
        for time_modif in map_user_id_time_modif[user_id]:
            if previous_time == None:
                previous_time = time_modif
                list_intervals.append([time_modif, time_modif])
            else:
                delta = time_modif - previous_time
                delta_in_minutes = delta.total_seconds() / 60.0
                if delta_in_minutes <= delta_min_between_save_minutes:
                    # same interval
                    list_intervals[-1][1] = time_modif
                else:
                    list_intervals.append([time_modif, time_modif])
                previous_time = time_modif

        for interval in list_intervals:
            delta = interval[1] - interval[0]
            delta_in_minutes = delta.total_seconds() / 60.0
            count_minutes += delta_in_minutes + delta_min_between_save_minutes
        map_result_time_by_user[user_id] = {"intervals" : list_intervals,
                                            "total_minutes" : count_minutes}

    return map_result_time_by_user

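
# Illustrative sketch (not in the original source): saves less than
# delta_min_between_save_minutes apart are merged into one work interval, and
# each interval is credited its own duration plus one extra delta as overhead.
def _demo_count_time_lab_by_user():
    saves = [{"user_id": 1, "created_at": datetime.datetime(2024, 4, 8, 10, 0)},
             {"user_id": 1, "created_at": datetime.datetime(2024, 4, 8, 10, 5)},
             {"user_id": 1, "created_at": datetime.datetime(2024, 4, 8, 10, 30)}]
    per_user = count_time_lab_by_user(saves, delta_min_between_save_minutes=10)
    # Two intervals: [10:00-10:05] and [10:30-10:30] => (5 + 10) + (0 + 10) minutes.
    assert per_user[1]["total_minutes"] == 25.0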

def list_action_by_user(all_result_manual_correct, df_complete):
    map_user_id_list_pages = {}
    map_user_id_list_pages_for_split = {}
    map_user_id_time_modif = {}
    try:
        for result in all_result_manual_correct:
            if "manual_input_info" not in result or "list_actions" not in result["manual_input_info"]:
                print("Warning missing actions in result : " + str(result))
                continue
            user_id = result["user_id"] if "user_id" in result else 0
            if user_id not in map_user_id_list_pages:
                map_user_id_list_pages[user_id] = []
                map_user_id_time_modif[user_id] = []
                map_user_id_list_pages_for_split[user_id] = []
            if "created_at" in result and type(result["created_at"]) == datetime.datetime:
                map_user_id_time_modif[user_id].append(result["created_at"])
            for action in result["manual_input_info"]["list_actions"]:
                if "id_page" in action:
                    if not str(action["id_page"]).isdigit():
                        print("How come this id_page isn't an int : " + str(action["id_page"]))
                        continue
                    id_page = int(action["id_page"])
                    if id_page not in map_user_id_list_pages[user_id]:
                        map_user_id_list_pages[user_id].append(id_page)
                    if "col_name" in action and action["col_name"] == "Liste des pages":
                        if id_page not in map_user_id_list_pages_for_split[user_id]:
                            map_user_id_list_pages_for_split[user_id].append(id_page)
                elif "id" not in action or not is_numericf(str(action["id"])):
                    print("Warning missing id in action : " + str(action))
                    continue
                else:
                    id = int(float(action["id"]))
                    list_pages = df_complete.loc[df_complete["id"] == id, "Liste des pages"].values[0] if "Liste des pages" in df_complete.columns else ""
                    if list_pages == "" or not list_pages.replace(",", "").isdigit():
                        print("Warning missing list_pages in df_complete for id : " + str(id))
                        continue
                    list_pages_as_list_int = [int(x) for x in list_pages.split(",") if x.strip().isdigit()]
                    map_user_id_list_pages[user_id].extend(list_pages_as_list_int)
                    if "col_name" in action and action["col_name"] == "Liste des pages":
                        map_user_id_list_pages_for_split[user_id].extend(list_pages_as_list_int)
        for user_id in map_user_id_list_pages:
            map_user_id_list_pages[user_id] = sorted(list(set(map_user_id_list_pages[user_id])))
        for user_id in map_user_id_list_pages_for_split:
            map_user_id_list_pages_for_split[user_id] = sorted(list(set(map_user_id_list_pages_for_split[user_id])))
    except Exception as e:
        print(str(e))

    return map_user_id_list_pages, map_user_id_time_modif, map_user_id_list_pages_for_split

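
# Illustrative sketch (not in the original source): actions either carry an
# id_page directly or a row "id" that is resolved to its "Liste des pages"
# value in df_complete; "Liste des pages" corrections also feed the split map.
def _demo_list_action_by_user():
    import pandas as pd
    df_complete = pd.DataFrame({"id": [2], "Liste des pages": ["5,6"]})
    saves = [{"user_id": 1, "created_at": datetime.datetime(2024, 4, 8, 10, 0),
              "manual_input_info": {"list_actions": [
                  {"id_page": "3"},
                  {"id": "2", "col_name": "Liste des pages"}]}}]
    pages, times, pages_split = list_action_by_user(saves, df_complete)
    assert pages == {1: [3, 5, 6]} and pages_split == {1: [5, 6]}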

def modify_audit_info_from_manual_correction(audit_info, all_result_manual_correct,
                                             manual_action_to_audit_data = {},
                                             project_id = None,
                                             lib_user_data_internal = None):
    if manual_action_to_audit_data == {}:
        if lib_user_data_internal == None:
            print(" We need access to database to get the conf_project and saxia param")
        else:
            print(" We will load the configuration saxia from the project_id")
            if project_id == None:
                print("Error project_id is None and no manual_action_to_audit_data")

                # We can still try to get the project_id from the hash_id_token, but won't do it right now
                hash_id_treatment = audit_info["config"]["complete_param_json"]["hash_id_treatment"] if "config" in audit_info and "complete_param_json" in audit_info["config"] and "hash_id_treatment" in audit_info["config"]["complete_param_json"] else None
                all_result = lib_user_data_internal.load_data_audit(hash_id_treatment=hash_id_treatment)
                # print(" all_result : " + str(all_result.keys()))
                project_id = all_result["project_id"] if "project_id" in all_result else None

            if project_id == None:
                print(" We can't do anything")
            else:
                conf_project = lib_user_data_internal.load_conf_project(project_id)
                saxia_conf = conf_project["saxia"] if "saxia" in conf_project else {}
                assoc_conf = saxia_conf["assoc"] if "assoc" in saxia_conf else {}
                manual_action_to_audit_data = assoc_conf["manual_action_to_audit_data"] if "manual_action_to_audit_data" in assoc_conf else {}

    nb_modif_class_manual = 0
    nb_manual_action_df = 0
    map_modif_df_manual = {}  # would also allow a projection
    paragraphs = []
    df_auto = None
    list_action_for_df_to_correct = []
    map_count_modif_per_doc = {}
    map_modif_type_document = {}
    try:
        # key_as_list_slash = "io_exec/0/output/paragraphs"
        key_as_list_slash = "io_exec/3/input/paragraphs"
        key_as_list_slash = manual_action_to_audit_data["class_paragraph"]["audit_data"] if "class_paragraph" in manual_action_to_audit_data and "audit_data" in manual_action_to_audit_data["class_paragraph"] else key_as_list_slash
        audit_info_corrected = audit_info

        map_col_id_to_id_df = {}

        paragraphs = load_sub_json(audit_info_corrected, key_as_list_slash)
        # order modifications by increasing date
        all_result_manual_correct_ordered = sorted(all_result_manual_correct, key=lambda x: x["created_at"])
        for result in all_result_manual_correct_ordered:
            created_at_as_string = result["created_at"].strftime("%y%m%d_%H:%M:%S")
            print(" Modification from created_at_as_string :" + created_at_as_string + " result id : " + str(result["id"]))
            manual_input_info = result["manual_input_info"] if "manual_input_info" in result else {}
            list_actions = manual_input_info["list_actions"] if "list_actions" in manual_input_info else []
            for action in list_actions:
                type_action = action["type_action"] if "type_action" in action else "default"
                if type_action == "class_paragraph":
                    nb_modif_class_manual += 1
                    # class_action stores an id_page instead of an id_file => significant impact to fix, so we will probably keep it this way 8/4/24
                    id_page = action["id_page"]
                    if id_page not in map_count_modif_per_doc:
                        map_count_modif_per_doc[id_page] = 0
                    map_count_modif_per_doc[id_page] += 1
                    id_paragraph = action["id_paragraph"] if "id_paragraph" in action else None
                    manual_class = action["manual_class"] + "_class" if "manual_class" in action else None
                    if manual_class != None:
                        manual_class = manual_class.replace("_class_class", "_class")
                    id_page_int = int(id_page)
                    if id_paragraph == None:
                        print("ERROR certainly due to crops modified again by switch 2025-05 ")
                        continue
                    id_paragraph_int = int(id_paragraph)
                    if id_page_int < len(paragraphs):
                        if id_paragraph_int < len(paragraphs[id_page_int]):
                            paragraphs[id_page_int][id_paragraph_int]["class"] = manual_class
                        else:
                            print("ERROR id_paragraph_int greater than len(paragraphs[id_page_int]) " + str(id_paragraph_int) + " nb : " + str(len(paragraphs[id_page_int])) + " for " + str(id_page_int))
                            continue
                    else:
                        print("ERROR id_page_int greater than len(paragraphs) " + str(id_page_int) + " nb : " + str(len(paragraphs)))
                        continue

                    # audit_info_corrected["io_exec"]["3"]["input"]["paragraphs"] = paragraphs

                elif type_action == "df_meta_info_correct":
                    key_action = create_key_action(action)
                    count = (map_modif_df_manual[key_action]["count"] + 1) if key_action in map_modif_df_manual and "count" in map_modif_df_manual[key_action] else 1
                    map_modif_df_manual[key_action] = action
                    map_modif_df_manual[key_action]["count"] = count
                    map_modif_df_manual[key_action]["audit_or_datou"] = "datou"

                    # list_action_for_df_to_correct.append(action)
                    if True:
                        key_for_df_as_md = "io_exec/6/output"  # df_complet_as_markdown
                        key_for_df_as_md = "io_exec/9/output/df_complet_as_markdown"  # df_complet_as_markdown # It is 9 in prod, grrr => that is not right at all, grrr
                        key_for_df_as_md = manual_action_to_audit_data["df_meta_info_correct"]["audit_data"] if "df_meta_info_correct" in manual_action_to_audit_data and "audit_data" in \
                            manual_action_to_audit_data["df_meta_info_correct"] else key_for_df_as_md
                        df_as_md = load_sub_json(audit_info_corrected, key_for_df_as_md)
                        key_for_df_as_json = "io_exec/9/output/df_complet_as_json"
                        key_for_df_as_json = manual_action_to_audit_data["df_meta_info_correct_json"]["audit_data"] if "df_meta_info_correct_json" in manual_action_to_audit_data and "audit_data" in \
                            manual_action_to_audit_data["df_meta_info_correct_json"] else key_for_df_as_json
                        df_as_json = load_sub_json(audit_info_corrected, key_for_df_as_json)

                        if df_as_json != None:
                            import pandas as pd
                            df = pd.read_json(df_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])

                            # from copy import deepcopy
                            if type(df_auto) == types.NoneType:
                                df_auto = df.copy()  # deep=True
                                # df_auto = deepcopy(df_as_json)

                        elif df_as_md != None:
                            print("ERROR SINCE 06-2024 : WE NEVER GO THROUGH THIS !?! ")
                            # from lib.batch.lib_batch import create_pandas_table_from_text
                            # df = create_pandas_table_from_text(df_as_md, verbose = False)
                            print("ON 08-2024 : WE COMMENT THE TWO PREVIOUS LINES AND RETURN => LOAD AUDIT WILL CRASH ")
                            return None
                        else:
                            print("ERROR FROM THE START ")
                            from lib.batch.lib_batch import init_df_synchronize
                            df = init_df_synchronize()  # pandas.core.dtypes.generic.create_pandas_abc_type("DataFrame", [])
                        if action["col_name"] in df.columns:
                            id_line = action["id"]
                            from lib.manaudit.lib_datou_audit import is_numericf
                            if not is_numericf(id_line):
                                # if id_line == "" or (id_line != "0" and not str(id_line).rstrip(".0").isdigit()):
                                print(" action not treated since input data is wrong : " + str(id_line) + " " + str(action))
                                continue
                            manual_value = action["manual_value"]
                            col_name = action["col_name"]
                            if col_name == "datet":
                                # print("Will maybe fail : " + str(action))
                                if type(df.loc[0, "datet"]) != pandas._libs.tslibs.timestamps.Timestamp and type(df.loc[0, "datet"]) != datetime.datetime and type(df.loc[0, "datet"]) != pandas._libs.tslibs.nattype.NaTType:
                                    print("Will fail now : " + str(action) + " result id : " + str(result["id"]))
                                    print(str(type(df.loc[0, "datet"])))
                            if col_name == "date_entree_hospitalisationt" or col_name == "date_sortie_hospitalisationt" or col_name == "date_fin_arret_travailt" or col_name == "date_debut_arret_travailt" or col_name == "datet":
                                import dateparser
                                manual_value_parsed = dateparser.parse(manual_value)
                                print(manual_value_parsed)
                                if manual_value_parsed == None:
                                    continue
                            if col_name == "document_type":
                                map_modif_type_document[id_line] = manual_value
                            if map_col_id_to_id_df == {}:
                                for i in range(len(df)):
                                    idx_df = df.index[i]
                                    idx_from_data = df.loc[idx_df]["id"]
                                    if str(idx_from_data).replace(".0", "").isdigit():
                                        map_col_id_to_id_df[str(idx_from_data).replace(".0", "")] = idx_df
                                    else:
                                        print("ERROR should fail, wrong data will be saved ! : " + str(idx_from_data))
                                        # if str(idx_df).isdigit():
                                        #     map_col_id_to_id_df[str(idx_df)] = idx_df
                            if str(id_line).replace(".0", "") in map_col_id_to_id_df:
                                id_line_df = map_col_id_to_id_df[str(id_line).replace(".0", "")]
                                # df.iloc[id_line_df][col_name] = manual_value
                                if col_name == "datet" and ("20" not in str(manual_value) and "19" not in str(manual_value)):
                                    print("Protect wrong date : " + str(manual_value))
                                    manual_value = None
                                df.loc[id_line_df, col_name] = manual_value
                            else:
                                print(" Missing id line in df : " + str(id_line) + " in " + str(map_col_id_to_id_df.keys()))

                            # df.iloc[int(float(id_line))][col_name] = manual_value
                            print("TO TEST")
                            df_as_md = df.to_markdown()
                            df_as_json = df.to_json()

                            # refactor: rename front_value and add a ...
                            manual = bool(action["manual"]) if "manual" in action else False
                            if manual:
                                nb_manual_action_df += 1

                            set_sub_json(audit_info_corrected, key_for_df_as_md, df_as_md)
                            set_sub_json(audit_info_corrected, key_for_df_as_json, df_as_json)
                            # audit_info_corrected["io_exec"]["9"]["output"]["df_complet_as_markdown"] = df.to_markdown()
                        else:
                            # print("col_name not in df : " + str(action["col_name"]) + " in " + str(df.columns))
                            list_action_for_df_to_correct.append(action)
                elif type_action == "crops":
                    id_page = action["id_page"] if "id_page" in action else None
                    if id_page == None:
                        print("ERROR treated as warning id_page not in action")
                        continue
                    # paragraphs[id_page]["crops"] = action["crops"] if "crops" in action else None

                    if "modify" in action:
                        for modification in action["modify"]:
                            if "id" not in modification:
                                print("ERROR id of paragraph not in modification")
                                continue
                            par_id = int(modification["id"])
                            if "x" in modification:
                                x = modification["x"]
                                paragraphs[id_page][par_id]["x"] = x
                            if "y" in modification:
                                y = modification["y"]
                                paragraphs[id_page][par_id]["y"] = y
                            if "w" in modification:
                                w = modification["w"]
                                paragraphs[id_page][par_id]["w"] = w
                            if "h" in modification:
                                h = modification["h"]
                                paragraphs[id_page][par_id]["h"] = h
                            if "text" in modification:
                                text = modification["text"]
                                paragraphs[id_page][par_id]["text"] = text

                    if "delete" in action:
                        for id in action["delete"]:
                            if not str(id).isdigit():
                                print("Error due to deleted crop that has no id")
                                continue
                            if int(id) < len(paragraphs[id_page]):
                                paragraphs[id_page][int(id)] = {}
                                # del paragraphs[id_page][int(id)]
                            else:
                                print("ERROR id of paragraph not in modification")
                    if "add" in action:
                        for one_new_par in action["add"]:
                            new_id = len(paragraphs[id_page])
                            one_new_par["id"] = new_id
                            paragraphs[id_page].append(one_new_par)
                            # if id not in paragraphs[id_page]:
                            #     paragraphs[id_page][id] = {}
                            # else:
                            #     print("ERROR id of paragraph not in modification")
                else:
                    print("type action not supported : " + str(type_action))
    except Exception as e:
        print(str(e))
        print("Error treated as warning (to be audited) in modify_audit_info_from_manual_correction")
        audit_info_corrected = audit_info
        # list_action_for_df_to_correct = []

    results, audit_info_from_datou = parse_audit_info(audit_info_corrected, list_action_for_df_to_correct, map_modif_df_manual)
    nb_manual_action_df_for_col_audit = audit_info_from_datou["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_from_datou else -4
    map_modif_df_manual = audit_info_from_datou["map_modif_df_manual"] if "map_modif_df_manual" in audit_info_from_datou else {}
    audit_info_count = {"nb_modif_class_manual" : nb_modif_class_manual, "nb_manual_action_df" : nb_manual_action_df,
                        "nb_manual_action_df_for_col_audit" : nb_manual_action_df_for_col_audit,
                        "map_modif_df_manual" : map_modif_df_manual,
                        "map_count_modif_per_doc" : map_count_modif_per_doc,
                        "map_modif_type_document" : map_modif_type_document}
    results["paragraphs"] = paragraphs
    return df_auto, results, audit_info_count

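
# Illustrative sketch (not in the original source): the shape of the
# manual_action_to_audit_data mapping read above. Each manual action type
# points, via a slash path, at the spot in the audit json it corrects; the
# values shown here are the defaults hard-coded in the function.
def _sketch_manual_action_to_audit_data():
    return {
        "class_paragraph": {"audit_data": "io_exec/3/input/paragraphs"},
        "df_meta_info_correct": {"audit_data": "io_exec/9/output/df_complet_as_markdown"},
        "df_meta_info_correct_json": {"audit_data": "io_exec/9/output/df_complet_as_json"},
    }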

def load_audit_info_and_apply_manual_correction(hash_id_treatment_auto = None,
                                                hash_id_treatment_manual = None,
                                                lpgss = None,
                                                limit = None,
                                                project_id = None,
                                                ids_manual = None,
                                                manual_action_to_audit_data = {}):
    import json
    df_auto = None
    all_result = lpgss.load_data_audit(hash_id_treatment=hash_id_treatment_manual, col_csv="*")
    # print(" all result : " + str(all_result))
    if type(all_result) == type(None):
        print(" all_result seems none from hash_id_treatment : " + str(hash_id_treatment_manual))
        all_result = {}
    print(" all_result : " + str(all_result.keys()))
    info_date = all_result["info_date"] if "info_date" in all_result else {}
    info_date["test_var_info_date"] = "We want to find in which hit this data is recorded, either hit_main or cons_hit"
    audit_json_file_content = all_result["audit_info"] if "audit_info" in all_result else {}
    id_file = all_result["id_file"] if "id_file" in all_result else None
    try:
        audit_json_file_content_as_json = json.loads(audit_json_file_content)
    except Exception as e:
        audit_json_file_content_as_json = audit_json_file_content
        print(str("This is now on the critical path; I think an ajax call would fix the problem, but I do not understand"))
        print(str(e))

    try:
        from lib.manaudit.lib_datou_audit import parse_audit_info, modify_audit_info_from_manual_correction
        if limit != None and int(limit) == 0:
            all_results = []
        else:
            all_results = lpgss.load_data_manual(ids_manual=ids_manual,
                                                 hash_id_treatment=hash_id_treatment_manual,
                                                 limit=limit)
        if len(all_results) == 0:
            results, audit_info_count = parse_audit_info(audit_json_file_content)
            nb_manual_action_df_for_col_audit = audit_info_count[
                "nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_count else -3
            # audit_info_count = {}
            df_auto_as_json = None
            print("There are no corrections, to be tested")
            df_auto = results["df"].copy()  # deep = True
        else:
            df_auto, results, audit_info_count = modify_audit_info_from_manual_correction(
                audit_json_file_content_as_json,
                all_results,
                manual_action_to_audit_data=manual_action_to_audit_data,
                project_id=project_id)
    except Exception as e:
        results = {"error": str(e)}
        audit_info_count = {}
        print(str(e))
        print("LOG_TO_PARSE : Error parsing audit result json manual : " + str(hash_id_treatment_manual) + " auto " + str(hash_id_treatment_auto))

    hash_id_treatment_rerun = all_result["info_date"]["consolidate_hash_id_treatment"] if "info_date" in all_result and "consolidate_hash_id_treatment" in all_result["info_date"] else None
    df_cons = all_result["info_consolidate"]["df_cons"] if "info_consolidate" in all_result and "df_cons" in all_result["info_consolidate"] else None

    return df_auto, df_cons, hash_id_treatment_rerun, results, audit_info_count, id_file, audit_json_file_content_as_json, all_results, info_date


def get_list_backup(histo_folder, mtd_id = None):
    # In the histo_folder directory we want to parse json files with names like datou_anon_42_0213_22.json,
    # where 42 is the datou_id and this one is the backup from 22:00 on 02-13; the year should be optional, 2024 by default.
    # We also want to be able to filter by mtd_id, but that is less important.
    # We also want to be able to sort by date for display in the front end.
    import os
    import json
    map_datou_id_date_backup = {}
    map_proj_id_date_backup = {}
    if histo_folder == None:
        return map_datou_id_date_backup, map_proj_id_date_backup
    for filename in os.listdir(histo_folder):
        if filename.endswith(".json"):
            print("Open " + str(filename))
            with open(os.path.join(histo_folder, filename)) as f:
                try:
                    data = json.load(f)
                except Exception as e:
                    print(" Error reading " + filename + " : " + str(e))
                    continue

            if "project_id" in data:
                map_proj_id_date_backup[data["project_id"]] = data
                continue

            if type(data) == list:
                if len(data) == 1:
                    data = data[0]
                else:
                    print("Maybe warning: size of data in " + filename + " : " + str(len(data)) + " avoiding this file ")

            if "id" not in data:  # to manage the case of the intricate export
                if "datou" not in data or len(data["datou"]) == 0:
                    print("Unexpected data in " + filename + " : " + str(data) + " avoiding this file ")
                    continue

                data = data["datou"][0]

            if mtd_id != None:
                if "id" in data and int(data["id"]) != int(mtd_id):
                    continue
            if "id" in data and data["id"] not in map_datou_id_date_backup:
                map_datou_id_date_backup[data["id"]] = {}

            print(" filename : " + str(filename))

            import re
            # example file name for parsing the date : datou_anon_42_0213_22.json (and check the datou id)

            suffix = ""
            try:
                date_parsed = datetime.datetime.now()
                m = re.search(r"_(\d+)_([\s\w\W-]*_)?(\d{4})?(\d{2})(\d{2})_(\d{2})", filename)

                if m:
                    # NB: the pattern defines six groups, so m.groups() always has length 6
                    # (missing optional groups come back as None) and only the 6-group branch is reachable.
                    if len(m.groups()) == 5 and m.group(2) != None and m.group(2) != "":
                        print(" Parsed 5 groups ! " + str(m.groups()))  # to be tested with the thing at line 92
                        date_parsed = datetime.datetime(int(m.group(2)), int(m.group(3)), int(m.group(4)),
                                                        int(m.group(5)))
                    elif len(m.groups()) == 5:
                        print(" Parsed 4 real groups ! " + str(m.groups()))
                        default_year = 2024
                        month = int(m.group(3))
                        day = int(m.group(4))
                        hour = int(m.group(5))

                        date_parsed = datetime.datetime(default_year, month, day, hour)
                    elif len(m.groups()) == 6:
                        print(" Parsed 6 groups ! " + str(m.groups()))
                        default_year = 2024 if (m.group(3) == None or m.group(3) == "") else int(m.group(3))
                        month = int(m.group(4))
                        day = int(m.group(5))
                        hour = int(m.group(6))
                        suffix = m.group(2)
                        if suffix == None:
                            suffix = ""
                        mtd_id_parsed = int(m.group(1))

                        date_parsed = datetime.datetime(default_year, month, day, hour)
                    elif len(m.groups()) == 4:
                        print(" Fallback not managed with 4 groups, WTF does it mean ? " + str(m.groups()) + " filename : " + filename)
                        continue
                    else:
                        print("Could not parse date from filename : " + filename)
                        continue
                else:
                    print("Could not parse date from filename : " + filename)
            except Exception as e:
                print(str(e))
                print("parsing name of backup didn't work, we may have a wrong convention between export and voila, continue : " + filename)
                continue

            from lib.lib_util import humanize_modified_time
            date_parsed_h = humanize_modified_time(date_parsed.replace(tzinfo=None))
            data["saved_at"] = date_parsed
            data["saved_at_h"] = date_parsed_h + " : " + str(suffix).rstrip("_")  # filename # date_parsed.strftime("%y%m%d_%H")

            # del data["data_str"]
            map_datou_id_date_backup[data["id"]][date_parsed] = data

    for mtd_id in map_datou_id_date_backup:
        map_datou_id_date_backup[mtd_id] = dict(sorted(map_datou_id_date_backup[mtd_id].items(), reverse=True))

    map_datou_id_date_h_backup = {}
    for mtr_id in map_datou_id_date_backup:
        map_datou_id_date_h_backup[mtr_id] = {}
        for date in map_datou_id_date_backup[mtr_id]:
            data = map_datou_id_date_backup[mtr_id][date]
            map_datou_id_date_h_backup[mtr_id][data["saved_at_h"]] = data

    return map_datou_id_date_h_backup, map_proj_id_date_backup

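
# Illustrative sketch (not in the original source): how the backup file name
# pattern above decomposes datou_anon_42_0213_22.json; the optional suffix and
# year groups come back as None when absent.
def _demo_backup_filename_regex():
    import re
    m = re.search(r"_(\d+)_([\s\w\W-]*_)?(\d{4})?(\d{2})(\d{2})_(\d{2})", "datou_anon_42_0213_22.json")
    # datou_id 42, no suffix, default year, month 02, day 13, hour 22
    assert m.groups() == ("42", None, None, "02", "13", "22")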

# --job=saxia.stat_quali --limit=200 -v --project_id=134
# --job=saxia.stat_quali --limit=200 -v --project_id=122

def study_qualite_2024(lpgss = None,
                       type_doc = "document_type",
                       verbose = False,
                       condition_query = "TODO",
                       limit = 100,
                       project_id = None):

    if lpgss == None:
        print("Missing DB connector")
        return None

    from lib.lib_util import count_and_display_elapsed_time
    import time
    begin_time = time.time()

    try:
        auto_res = lpgss.load_auto_val(type_doc, verbose, limit, project_id = project_id)
    except Exception as e:
        print("Error loading auto_val : " + str(e))
        import pandas as pd
        auto_res = pd.DataFrame(columns=["hit", "id_row", "auto_val", "id_file", "id", "hit_id_row"])

    auto_res.to_csv("auto_res.csv", sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_auto_val " + type_doc)

    manual_res = lpgss.load_manual_correct_val(type_doc, verbose, limit, project_id = project_id)
    manual_res.to_csv("manual_res.csv", sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_manual_correct_val " + type_doc)

    all_df_cons = lpgss.load_df_cons(verbose, limit, project_id = project_id)
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_df_cons ")
    big_df = None
    import pandas as pd
    for data in all_df_cons:
        one_df = pd.DataFrame(data["df_cons"])
        one_df["id_doc"] = 0
        one_df["id_mde"] = 0
        id_file = data["id_file"]
        id_mde = data["id"] if "id" in data else None
        one_df["id_file"] = id_file
        for idx, row in one_df.iterrows():
            if row["id"] == "NaN" or row["id"] == "" or not str(row["id"]).isdigit():
                print("Warning : id is NaN or empty in one_df, skipping this row")
                continue
            one_df.at[idx, "id"] = 1000 * id_mde + int(row["id"])
            one_df.at[idx, "id_mde"] = id_mde
            one_df.at[idx, "id_doc"] = int(row["id"])
            str_id = id_file + "_l_" + str(row["id"])
            # one_df.at[idx, "id"] = str_id

        if big_df is None:
            big_df = one_df
        else:
            big_df = pd.concat([big_df, one_df], ignore_index=True)

    big_df.to_csv('big_df.csv', sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "build big_df ")

    # build key
    key = "hit_id_row"
    for idx in auto_res.index:
        data = auto_res.loc[idx]
        auto_res.loc[idx, key] = str(data["hit"]) + "_" + str(data["id_row"])
    for idx in manual_res.index:
        data = manual_res.loc[idx]
        manual_res.loc[idx, key] = str(data["hit"]) + "_" + str(data["id_row"])
    # set manual_val in auto_res

    map_count_key_value = {}
    total = 0

    for idx in auto_res.index:
        data = auto_res.loc[idx]
        key_val = data["hit_id_row"]

        total += 1
        try:
            # if True:  # key in manual_res.index:
            idxs = manual_res.loc[manual_res['hit_id_row'] == key_val].index
            if len(idxs) == 1:
                auto_res.loc[idx, "manual_val"] = manual_res.loc[idxs[0]]["manual_val"]
                val = manual_res.loc[idxs[0]]["manual_val"]
            else:
                if len(idxs) > 1:
                    print(" Not treated due to unexpected multiple values")
                auto_res.loc[idx, "manual_val"] = data["auto_val"]
                val = data["auto_val"]

            if val not in map_count_key_value:
                map_count_key_value[val] = 0
            map_count_key_value[val] += 1
        except Exception as e:
            print(str(e))
            auto_res.loc[idx, "manual_val"] = data["auto_val"]
            # print("missing key " + key + " in auto_res")
    # set manual_val to auto_val if missing

    map_count_key_value_ordered = dict(sorted(map_count_key_value.items(), key=lambda item: item[1], reverse=True))

    print(" total : " + str(total))
    print(" map_count_key_value_ordered : " + str(map_count_key_value_ordered))
    count_quantile_90 = 0
    count_nb_to_keep = 0
    for key in map_count_key_value_ordered:
        count_quantile_90 += map_count_key_value_ordered[key]
        count_nb_to_keep += 1
        if count_quantile_90 > 0.9 * total:
            break

    print(" Kept count_nb_to_keep : " + str(count_nb_to_keep) + " over " + str(len(map_count_key_value_ordered)))

    if type_doc == "document_type":
        list_key_value_doc_type = ["cr_operation", "cr_exam", "cr_hospit", "cr_urgence", "courrier", "facture_utile", "facture", "ordonnance", "certif_blessure", "certif_at", "attestation", "certif_medical", "certif_hospitalisation", "autre", "ordonnance_medicament_exception", "cr_pompier", "facture_inutile"]
    else:
        list_key_value_doc_type = list(map_count_key_value_ordered.keys())
        # [:count_nb_to_keep]
    print(" list_key_value_doc_type : " + str(list_key_value_doc_type))
    # type_doc
    # create a dataframe with these lists as columns and rows
    import pandas as pd
    # df = pd.DataFrame(columns=list_key_value_doc_type)

    # Now compute the confusion matrix
    associated_labels = {}
    predicted_labels = {}
    list_associated_labels = list_key_value_doc_type + ["Unknown Label"]
    list_predicted_labels = list_key_value_doc_type + ["Unknown Label"]
    list_data_on_sen_fout = []
    map_list_error = {}
    idx_voila = 0
    for idx in auto_res.index:
        data = auto_res.loc[idx]
        manual_val = data["manual_val"]
        auto_val = data["auto_val"]
        idx_voila += 1  # data["hit_id_row"]
        associated_labels[idx_voila] = manual_val
        predicted_labels[idx_voila] = auto_val
        if auto_val != manual_val:
            key_diff = str(auto_val) + "P" + str(manual_val)
            if key_diff not in map_list_error:
                map_list_error[key_diff] = []

            map_list_error[key_diff].append((data["hit"], data["id_row"]))

    if type_doc == "document_type":

        from pyfvs.lib.advanced.lib_confusion_matrix import compute_confusion_matrix
        mat_conf = compute_confusion_matrix(associated_labels, predicted_labels,
                                            list_associated_labels, list_predicted_labels, list_data_on_sen_fout = [])
        print("mat_conf : " + str(mat_conf))
        print(mat_conf)
        with open("mat_conf_" + type_doc + ".html", "w") as f:
            f.write(str(mat_conf.to_html()))
        print(" mat_conf_" + type_doc + ".html written")

        print("<br><br> Error type_doc : " + type_doc + "<br><br>")
        for d in list_associated_labels:
            for p in list_predicted_labels:
                if mat_conf.at[p, d] > 0:
                    key_list_diff = str(p) + "P" + str(d)
                    if key_list_diff in map_list_error:
                        print("<br>mat_conf.at[" + d + "," + p + "] : " + str(mat_conf.at[p, d]))
                        for data in map_list_error[key_list_diff]:
                            print("<br><a href='https://safia.app/manax?hash_id_treatment=" + str(data[0]) + "&id_line=" + str(data[1]) + "'> LINE " + str(data[1]) + " " + str(p) + " to " + str(d) + " </a>")
                        # print("mat_conf.at[" + d + "," + p + "] : " + str(mat_conf.at[p, d]))
                        # for data in auto_res:
                        #     if data["auto_val"] == p and data["manual_val"] == d:
                        #         print("data : " + str(data))
    else:
        print("<br>\n".join(list(map_list_error.keys())))

    print("TO CHECK WIP 15-8-24")

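
# Illustrative sketch (not in the original source): the cumulative 90% cutoff
# used above to decide how many label values are worth keeping.
def _sketch_quantile_cutoff():
    map_count = {"courrier": 70, "facture": 25, "autre": 5}
    total = sum(map_count.values())
    count_quantile_90 = 0
    count_nb_to_keep = 0
    for key in dict(sorted(map_count.items(), key=lambda item: item[1], reverse=True)):
        count_quantile_90 += map_count[key]
        count_nb_to_keep += 1
        if count_quantile_90 > 0.9 * total:
            break
    return count_nb_to_keep  # 2 : "courrier" and "facture" already cover 95 of 100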

926# --job=saxia.stat_quali --limit=200 -v --project_id=94 --in_file=condition_intro_doc,only_correct_prediag,condition_only_one_page 

927 

928def study_qualite(lpgss = None, 

929 type_doc = "document_type", 

930 verbose = False, 

931 condition_query = "TODO", 

932 limit = 100, 

933 project_id = None, 

934 condition_intro_doc = True, 

935 only_correct_prediag = True, 

936 condition_only_one_page = True, 

937 list_to_study = "document_type,medecin_nom,medecin_specialite,datet", 

938 with_out_folder = False, 

939 prepare_data_set = False): 

940 

941 if lpgss == None: 

942 print("Missing DB connector") 

943 return None 

944 

945 list_dataset = [] 

946 

947 map_list_input_by_document = {} 

948 list_complete = [] 

949 if condition_intro_doc == True: 

950 if project_id == None: 

951 load_conf_from_project_id = 91 

952 else: 

953 load_conf_from_project_id = project_id 

954 

955 from lib.lib_safia_system import LibSafiaSystem 

956 lss = LibSafiaSystem(lib_user_data_internal=lpgss) 

957 raw_configuration = lss.load_conf_project(load_conf_from_project_id) 

958 from lib.util.lib_formal_conf import formal_conf_prepare 

959 configuration = formal_conf_prepare(raw_configuration, lss) 

960 intro_format_intro = load_sub_json(configuration, "saxia/format/info_format_intro/format/intro") 

961 

962 from lib.lib_util import compute_list_input_to_format_per_document 

963 map_list_input_by_document = compute_list_input_to_format_per_document(intro_format_intro) 

964 # On a aussi besoin de la liste complète des types de document, donc on veut faire une concatenation des listes qui sont les valeurs du amp_list_input_by_document 

965 list_complete = [] 

966 for key in map_list_input_by_document: 

967 list_complete += map_list_input_by_document[key] 

968 list_complete = list(set(list_complete)) 

969 

970 from lib.lib_util import count_and_display_elapsed_time 

971 import time 

972 begin_time = time.time() 

973 

974 all_df_cons = lpgss.load_df_cons(limit = limit, project_id = project_id, verbose=verbose, with_out_folder=False) #with_out_folder) 

975 list_mhit = [] 

976 map_mhit_outfolder = {} 

977 for data in all_df_cons: 

978 if "mhit" in data and data["mhit"] != None: 

979 list_mhit.append(data["mhit"]) 

980 if with_out_folder: 

981 list_mhit_out_folder = lpgss.load_output_folder_from_mhit(list_mhit, verbose=verbose) 

982 for r in list_mhit_out_folder: 

983 if "mhit" in r and "out_folder" in r: 

984 if r["out_folder"] != None and r["mhit"] != None : 

985 map_mhit_outfolder[r["mhit"]] = r["out_folder"] + "/" + r["mhit"] 

986 else: 

987 print("Warning : missing out_folder for mhit " + str(r)) 

988 begin_time, message = count_and_display_elapsed_time(begin_time, "load_df_cons ") 

989 big_df = None 

990 import pandas as pd 

991 map_id_file_count_nan = {} 

992 

993 for data in all_df_cons: 

994 one_df = pd.DataFrame(data["df_cons"]) 

995 one_df["id_doc"] = 0 

996 one_df["id_mde"] = 0 

997 one_df["image_path"] = "" 

998 id_file = data["id_file"] 

999 out_folder = data["out_folder"] if "out_folder" in data else "" 

1000 mhit = data["mhit"] if "mhit" in data else None 

1001 if mhit != None and mhit in map_mhit_outfolder: 

1002 out_folder = map_mhit_outfolder[mhit] 

1003 id_mde = data["id"] if "id" in data else None 

1004 one_df["id_file" ] = id_file 

1005 map_id_file_count_nan[id_file] = 0 

1006 for idx, row in one_df.iterrows(): 

1007 if row["id"] == "NaN" or row["id"] == "" or not str(row["id"]).isdigit(): 

1008 if row["Liste des pages"] != "": 

1009 map_id_file_count_nan[id_file] += 1 

1010# print("Warning : id is NaN or empty in one_df, skipping this row : " + str(row)) 

1011 continue 

1012 one_df.at[idx, "id"] = 1000 * id_mde + int(row["id"]) 

1013 one_df.at[idx, "id_mde"] = id_mde 

1014 one_df.at[idx, "id_doc"] = int(row["id"]) 

1015 one_df.at[idx, "image_path"] = out_folder + "/page_" + str(row["Liste des pages"]) + ".png" if out_folder != "" else "" 

1016 str_id = str(id_file) + "_l_" + str(row["id"]) 

1017 # one_df.at[idx, "id"] = str_id 

1018 

1019 if big_df is None: 

1020 big_df = one_df 

1021 else: 

1022 big_df = pd.concat([big_df, one_df], ignore_index=True) 

1023 

1024 print("map_id_file_count_nan[id_file] " + str(map_id_file_count_nan[id_file]) + " rows with problem and non null Liste des Pages") 

1025 

1026 print(len(big_df), " rows in big_df") 

1027 

1028 # On ne va garder que les lignes qui ont "document_type" et "Liste des pages" non vides 

1029 if "document_type" in big_df.columns and "Liste des pages" in big_df.columns: 

1030 big_df = big_df[big_df["document_type"].notna() & big_df["Liste des pages"].notna()] 

1031 print(len(big_df), " rows in big_df") 

1032 big_df = big_df[big_df["document_type"] != ''] 

1033 print(len(big_df), " rows in big_df") 

1034 big_df = big_df[big_df["Liste des pages"] != ''] 

1035 

1036 print(len(big_df), " rows in big_df") 

1037 

1038 big_df["nb_word_cr"] = -1 

1039 big_df["nb_word_quantile"] = -1 

1040 map_list_quantile = {5:"0_5",20:"5_20",50:"20_50",150:"50_150",1000:"150_1000",10000:"1000_infini"} 

1041 for idx in big_df.index: 

1042 data = big_df.loc[idx] 

1043 if "cr_correct_typo" in data and not pd.isna(data["cr_correct_typo"]): 

1044 nb_word_cr = len(str(data["cr_correct"]).split()) 

1045 big_df.at[idx, "nb_word_cr"] = nb_word_cr 

1046 for q in map_list_quantile: 

1047 if nb_word_cr < q: 

1048 break 

1049 big_df.at[idx, "nb_word_quantile"] = map_list_quantile[q] 

1050 

1051 big_df.to_csv('big_df.csv', sep='\t') 

1052 begin_time, message = count_and_display_elapsed_time(begin_time, "build big_df ") 

1053 

1054 print("big_df : " + str(big_df.head(10))) 

1055 big_df = big_df.reset_index(drop=True) 

1056# big_df["ERROR"] = 'COL' 

1057 

1058 # Initialisation d'un dataframe avec les memes colonnes que big_df 

1059 df_to_study = pd.DataFrame(columns=big_df.columns) 

1060 # Rajout de la première ligne 

1061 #df_to_study = pd.concat([df_to_study, big_df.iloc[0:0]], ignore_index=True) 

1062 

1063 list_one_dim_distribution = ["document_type", "medecin_specialite", 

1064 "document_type_auto", "medecin_specialite_auto", 

1065 "Nombre de pages", "prediag", "Liste des pages", "nb_word_quantile"] 

1066 for c in list_one_dim_distribution: 

1067 if c not in big_df.columns: 

1068 print("Column " + c + " not in big_df, skipping") 

1069 continue 

1070 print("Processing column STUDY ONE DIM " + c) 

1071 map_val_nb = {} 

1072 count_total = 0 

1073 count_empty = 0 

1074 count_not_empty = 0 

1075 

1076 for idx in big_df.index: 

1077 data = big_df.loc[idx] 

1078 count_total += 1 

1079 val = data[c] if c in big_df.columns else None 

1080 if pd.isna(val) or val == "": 

1081 count_empty += 1 

1082 else: 

1083 count_not_empty += 1 

1084 if type(val) == list: 

1085 print("ERROR: value is a list, converting to string for counting: " + str(val)) 

1086 val = str(val) 

1087 if val not in map_val_nb: 

1088 map_val_nb[val] = 0 

1089 map_val_nb[val] += 1 

1090 # Display distribution 

1091 print("Distribution for column " + c + ":") 

1092 # Order by nb decreasing 

1093 map_val_nb = dict(sorted(map_val_nb.items(), key=lambda item: item[1], reverse=True)) 

1094 for val in map_val_nb: 

1095 if count_total == 0: 

1096 print(f"{val}: {map_val_nb[val]} (0.00%)") 

1097 else: 

1098 print(f"{val}: {map_val_nb[val]} ({round(map_val_nb[val] / count_total * 100, 2)}%)") 

1099 

1100 # Affiche les résultats 

1101 print("Column " + c + " :") 

1102 print("Total rows: " + str(count_total)) 

1103 if count_total == 0: 

1104 print("Empty auto: 0 (0.00%)") 

1105 print("Not empty auto: 0 (0.00%)") 

1106 else: 

1107 print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)") 

1108 print("Not empty auto: " + str(count_not_empty) + " (" + str(round(count_not_empty / count_total * 100, 2)) + "%)") 

1109 print("") 

1110 

1111 # On va maintenant faire un tableau de différentes paires de colonnes 

1112 list_paires = [("document_type", "prediag"), ("document_type", "nb_word_quantile"), ("prediag", "nb_word_quantile")] 

1113 for (col1, col2) in list_paires: 

1114 if col1 not in big_df.columns or col2 not in big_df.columns: 

1115 print("Column " + col1 + " or " + col2 + " not in big_df, skipping") 

1116 continue 

1117 print("Processing column pair STUDY TWO DIM " + col1 + " and " + col2) 

1118 map_val_nb = {} 

1119 map_val1_nb = {} 

1120 count_total = 0 

1121 count_empty = 0 

1122 count_not_empty = 0 

1123 

1124 for idx in big_df.index: 

1125 data = big_df.loc[idx] 

1126 count_total += 1 

1127 val1 = data[col1] 

1128 val2 = data[col2] 

1129 if pd.isna(val1) or val1 == "" or pd.isna(val2) or val2 == "": 

1130 count_empty += 1 

1131 else: 

1132 count_not_empty += 1 

1133 if str(val1) not in map_val_nb: 

1134 map_val_nb[str(val1)] = {} 

1135 map_val1_nb[str(val1)] = 0 

1136 if str(val2) not in map_val_nb[str(val1)]: 

1137 map_val_nb[str(val1)][str(val2)] = 0 

1138 key = str(val1) + "_" + str(val2) 

1139 

1140 map_val_nb[str(val1)][str(val2)] += 1 

1141 map_val1_nb[str(val1)] += 1 

1142 

1143 # Display distribution 

1144 print("Distribution for columns " + col1 + " and " + col2 + ":") 

1145 # Order by nb decreasing 

1146# map_val_nb = dict(sorted(map_val_nb.items(), key=lambda item: item[1], reverse=True)) 

1147 for k1 in map_val_nb: 

1148 print(f"{k1}: {map_val1_nb[k1]} ({round(map_val1_nb[k1] / count_total * 100, 2)}%) ") 

1149 for k2 in map_val_nb[k1]: 

1150 import sys 

1151 sys.stdout.write(f"{k2} : {map_val_nb[k1][k2]} ({round(map_val_nb[k1][k2] / map_val1_nb[k1] * 100, 2)}%) ") 

1152 print("") 

1153 

1154 # Affiche les résultats 

1155 print("Columns " + col1 + " and " + col2 + ":") 

1156 print("Total rows: " + str(count_total)) 

1157 print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)") 

1158 print("Not empty auto: " + str(count_not_empty) + " (" + str(round(count_not_empty / count_total * 100, 2)) + "%)") 

1159 print("") 

1160 

1161 

1162 

1163 list_comparison = [] 

1164 

1165 # Calcul des taux d'erreur et de correct pour chacune des colonnes 

1166 # , 'Nombre de pages' 

1167 list_comparison.append(("intro_correct_typo", "intro_back")) 

1168 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 'Titre', 

1169 'medecin_specialite', 'nom_hopital', 'genre_service_hopital', 

1170 'indication_examen', 'date_entree_hospitalisation', 

1171 'date_sortie_hospitalisation', 'motif_hospitalisation', 

1172 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced', 

1173 'date_fin_arret_travailt', 'date_entree_hospitalisationt', 'date_sortie_hospitalisationt']: 

1174 col_auto = col_ref + "_auto" 

1175 col_manual = col_ref # + "_manual" 

1176 list_comparison.append((col_manual, col_auto)) 

1177 

1178 list_comparison.append(("cr_correct_typo", "cr_back")) 

1179 

1180 list_error_for_meta_data = [] 

1181 for (col_manual, col_auto) in list_comparison: 

1182 if col_auto not in big_df.columns or col_manual not in big_df.columns: 

1183 print("Column " + col_auto + " or " + col_manual + " not in big_df, skipping") 

1184 continue 

1185 list_errors = [] 

1186 list_commentaires_corrects = [] 

1187 list_commentaires_errors = [] 

1188 print("Processing column " + col_manual) 

1189 count_ignore = 0 

1190 count_total = 0 

1191 count_correct = 0 

1192 count_error = 0 

1193 count_empty = 0 

1194 count_not_empty = 0 

1195 count_not_empty_auto = 0 

1196 count_not_empty_manual = 0 

1197 for idx in big_df.index: 

1198 doc_type = big_df.loc[idx, "document_type"] 

1199 comms = big_df.loc[idx, "Commentaires"] 

1200 if doc_type in map_list_input_by_document: 

1201 list_input = map_list_input_by_document[doc_type] 

1202 if col_manual not in list_input and col_manual in list_complete and col_manual != 'document_type': 

1203 # print("Ignoring column " + col_manual + " for document type " + doc_type) 

1204 count_ignore += 1 

1205# list_error_for_meta_data.append(idx) 

1206 continue 

1207 else : 

1208 if condition_intro_doc: 

1209 print("MISSING DATA TO ANALYZE : " + str(doc_type)) 

1210 pass 

1211 if condition_only_one_page : 

1212 #if "Nombre de page" in big_df.columns : 

1213 if "Liste des pages_auto" in big_df.columns and "Liste des pages" in big_df.columns: 

1214 if str(big_df.loc[idx, "Liste des pages_auto"]).strip().isdigit() and str(big_df.loc[idx, "Liste des pages"]).strip().isdigit() and int(big_df.loc[idx, "Liste des pages"]) == int(big_df.loc[idx, "Liste des pages_auto"]): 

1215 pass 

1216 else : 

1217 if "," not in str(big_df.loc[idx, "Liste des pages_auto"]) or "," not in str(big_df.loc[idx, "Liste des pages"]): 

1218 if "none" != str(big_df.loc[idx, "Liste des pages_auto"]).lower(): 

1219 print("UNEXPECTED LISTE DES PAGES " + str(big_df.loc[idx, "Liste des pages_auto"]) + "||" + str(big_df.loc[idx, "Liste des pages"])) 

1220 else : 

1221 import sys 

1222 sys.stdout.write("ñø") 

1223 # Plusieurs pages 

1224 count_ignore += 1 

1225 list_error_for_meta_data.append(idx) 

1226 continue 

1227 else : 

1228 print("UNEXPECTED missing Nombre de page on document_type : " + str(doc_type) + " ") 

1229 # print("UNEXPECTED missing info intro on document_type : " + str(doc_type) + " ") 

1230 if only_correct_prediag: 

1231 if big_df.loc[idx, "prediag"] not in ("OK", "BON"): 

1232 if big_df.loc[idx, "prediag"] not in ("", "AUTRE", "CERFA_MANUSCRIT", "MANUSCRIT", "TABLEAU", "MAUVAIS", "PRESQUEBON"): 

1233 print("UNEXPECTED " + big_df.loc[idx, "prediag"]) 

1234 count_ignore += 1 

1235 list_error_for_meta_data.append(idx) 

1236 continue 

1237 else: 

1238 if big_df.loc[idx, "prediag"] == "": 

1239 count_ignore += 1 

1240 list_error_for_meta_data.append(idx) 

1241 continue 

1242 data = big_df.loc[idx] 

1243 count_total += 1 

1244 auto_val = data[col_auto] 

1245 manual_val = data[col_manual] 

1246 if pd.isna(auto_val) or auto_val == "": 

1247 count_empty += 1 

1248 else: 

1249 count_not_empty += 1 

1250 count_not_empty_auto += 1 

1251 if type(manual_val) == list or pd.isna(manual_val) or manual_val == "": 

1252 continue 

1253 count_not_empty_manual += 1 

1254 id_file = data["id_file"] 

1255 lp_id = data["Liste des pages"] 

1256 mde_id = int(data["id"]/ 1000) if type(data["id"]) != types.NoneType else "" 

            if str(auto_val).lower() == str(manual_val).lower():
                count_correct += 1
                list_commentaires_corrects.append(comms)
            else:
                count_error += 1
                list_error_for_meta_data.append(idx)
                if idx in [13, 19, 25, 28, 29, 30, 31, 32, 34, 38, 41, 42, 43, 55]:
                    # hard-coded indices kept from a manual debugging session
                    print("Unexpected wrong meta data but perfect cr_back !")
                comms_start = comms[:32] if isinstance(comms, str) else ""
                list_errors.append((mde_id, lp_id, manual_val, auto_val, comms_start))
                list_commentaires_errors.append(comms)
                if condition_intro_doc and doc_type in map_list_input_by_document and col_manual in map_list_input_by_document[doc_type]:
                    if idx not in list_error_for_meta_data:
                        list_error_for_meta_data.append(idx)
                    intro_correct = big_df.loc[idx, "intro_correct_typo"]
                    if not col_manual.startswith("date") and str(manual_val).lower() not in str(intro_correct).lower():
                        print("Error in " + col_manual + " for " + doc_type + " : " + str(manual_val) + " not in " + str(intro_correct) + " for idx : " + str(idx))
                        print("TO ANALYZE")

                if col_manual in list_to_study:
                    try:
                        # big_df.iloc[idx, "ERROR"] = col_manual
                        # Append this row to the df_to_study dataframe
                        big_df.loc[idx, "Commentaires"] = "DIFFERENCE in " + col_manual + " : " + comms
                        one_more_line = big_df.loc[[idx]]  # .loc keeps working when the index is not a RangeIndex
                        df_to_study = pd.concat([df_to_study, one_more_line], ignore_index=True)
                        print("Padam")
                    except Exception as e:
                        print("Error adding row to df_to_study for idx " + str(idx) + ": " + str(e))
                        continue
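                # df_to_study accumulates every mismatching row for the columns
                # under study; it is the dataframe returned by this function.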

        # Print the results for this column
        print("Column " + col_manual + " :")
        print("Ignored rows: " + str(count_ignore))
        print("Total rows: " + str(count_total))
        if count_total == 0:
            print("Empty auto: 0 (0.00%)")
            print("Not empty auto: 0 (0.00%)")
        else:
            print("Correct: " + str(count_correct) + " (" + str(round(count_correct / count_total * 100, 2)) + "%)")
            print("Errors: " + str(count_error) + " (" + str(round(count_error / count_total * 100, 2)) + "%)")
            print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
            print("Not empty auto: " + str(count_not_empty_auto) + " (" + str(round(count_not_empty_auto / count_total * 100, 2)) + "%)")
            print("Not empty manual: " + str(count_not_empty_manual) + " (" + str(round(count_not_empty_manual / count_total * 100, 2)) + "%)")
        print("List of error pairs:")
        print(str(list_errors))
        print("Comments on the error cases:")
        print(str(list_commentaires_errors))
        print("Comments on the correct cases:")
        print(str(list_commentaires_corrects))
        print("")

    list_complete = list(big_df.index)  # list(range(0, len(big_df)))
    list_total_correct = []  # list(big_df.index)
    for idx in list_complete:
        if idx not in list_error_for_meta_data:
            list_total_correct.append(idx)

    print("Total correct : " + str(len(list_total_correct)))
    print(str(list_total_correct))
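    # When intro checking is on, rows whose typo-corrected intro already equals
    # the back-office intro are merged with the rows that had no meta-data
    # error, and the intro is re-rendered from the manual fields as a cross-check.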

    if condition_intro_doc:
        list_all_correct = list(big_df.index[big_df["intro_correct_typo"] == big_df["intro_back"]]) + list_total_correct
        for idx in list_all_correct:
            print("idx : " + str(idx) + " id : " + str(big_df.loc[idx, "id"]))

        print(big_df.index[big_df["intro_correct_typo"] == big_df["intro_back"]])
        print(list_total_correct)
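        # Rebuild the intro from the manually validated fields with
        # format_one_res (dates parsed back from their ISO prefix), then print
        # it next to intro_correct_typo and intro_back for eyeball comparison.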

        for idx in list_all_correct:
            try:
                data = big_df.loc[idx]
                doc_type = data["document_type"]
                list_needed_column = map_list_input_by_document.get(doc_type, [])
                input_as_json = {}
                for f in list_needed_column:
                    if f in data and not pd.isna(data[f]) and data[f] != "":
                        if f.startswith("date"):
                            import datetime
                            data_used = datetime.datetime.strptime(data[f][:10], "%Y-%m-%d")
                        else:
                            data_used = data[f]
                        input_as_json[f] = data_used
                    else:
                        input_as_json[f] = None
                from lib.lib_util import format_one_res
                if doc_type in intro_format_intro:
                    intro_from_manual_saved = format_one_res(input_as_json, intro_format_intro[doc_type], format_premier=False, format_date="%d %B %Y")
                else:
                    print("Missing intro_format_intro for doc_type : " + str(doc_type))
                    intro_from_manual_saved = ""
                print(" idx : " + str(idx))
                print(" id : " + str(data["id"]))
                print(" intro_from_manual_saved : " + str(intro_from_manual_saved))
                print(" intro_correct_typo : " + str(data["intro_correct_typo"]))
                print(" intro_back : " + str(data["intro_back"]))
            except Exception as e:
                print("Error processing idx " + str(idx) + ": " + str(e))
                continue
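    # Dataset preparation: one record per usable row, pairing the document
    # image with its validated meta-data; the list is dumped as JSON and as a
    # tab-separated CSV before being handed to from_csv_create_json_dataset.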

    if prepare_data_set:
        for idx, data in big_df.iterrows():
            doc_type = data["document_type"]
            if doc_type in map_list_input_by_document:
                list_needed_column = map_list_input_by_document[doc_type]
            else:
                print("Missing doc_type in map_list_input_by_document : " + str(doc_type))
                continue
            one_data_set = {}
            if "image_path" not in data:
                print("Missing image_path, did you use with_out_folder ? " + str(data) + " for doc_type : " + str(doc_type) + ", skipping this row.")
                continue
            one_data_set["image_path"] = data["image_path"]
            json_extract = {}
            for c in list_needed_column:
                if c in data and not pd.isna(data[c]) and data[c] != "":
                    if c.startswith("date"):
                        try:
                            import datetime
                            json_extract[c] = datetime.datetime.strptime(data[c][:10], "%Y-%m-%d")
                        except Exception as e:
                            print("Error parsing date for " + c + ": " + str(e))
                            json_extract[c] = "None"
                    else:
                        json_extract[c] = data[c]
                else:
                    json_extract[c] = None
                    print("Missing data!")
                    continue  # note: only moves on to the next column, the row is still kept
            one_data_set["extract_meta_data"] = json_extract
            list_dataset.append(one_data_set)

        import datetime
        jour_suffix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        list_dataset_out_file = "dataset_" + jour_suffix + ".json"
        with open(list_dataset_out_file, "w") as f:
            import json
            # default=str makes json.dumps stringify the datetime objects
            # f.write(str(list_dataset))
            f.write(json.dumps(list_dataset, default=str, indent=4))
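        # A record in the dump looks roughly like this (shape inferred from the
        # code above; the field name below is only a hypothetical example, the
        # real keys come from map_list_input_by_document):
        # {"image_path": "docs/123.png",
        #  "extract_meta_data": {"date_example": "2024-01-01 00:00:00", ...}}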

        df_data = pd.DataFrame(list_dataset, columns=["image_path", "extract_meta_data"])
        list_dataset_out_file = "dataset_as_df_" + jour_suffix + ".csv"
        df_data.to_csv(list_dataset_out_file, sep='\t', index=False)

        from lib.lib_ml.lib_util_prepare_dataset import from_csv_create_json_dataset
        from_csv_create_json_dataset(list_dataset_out_file, crop=False,
                                     server_root="https://safia.app",
                                     folder_root="",
                                     download_or_get_local_file=False,  # used only for crop!
                                     col_url_path="image_path",
                                     col_text="extract_meta_data",
                                     sep='\t')

    df_to_study = df_to_study.reset_index(drop=True)
    return df_to_study