# lib/manaudit/lib_datou_audit.py  (1042 statements, 4% coverage per coverage.py v7.9.1)
import datetime
import types

import pandas.core.dtypes.generic

# We could extract the datou from the audit_info: it sits right at the beginning.
# But we have no function to build the datou from the json.
# We could also extract the man_audit_info from the datou rather than from the audit.
# In any case we have to search in the datou, or at least in the dict that represents it, for the useful json param.
def parse_audit_info(audit_info, list_action_for_df_to_correct=None, map_modif_df_manual=None):
    # Avoid mutable default arguments: the original defaults ([] and {}) would be
    # shared between calls, and map_modif_df_manual is mutated below.
    if list_action_for_df_to_correct is None:
        list_action_for_df_to_correct = []
    if map_modif_df_manual is None:
        map_modif_df_manual = {}
    nb_manual_action_df_for_col_audit = 0
    try:
        # In fact it is enough to find load_tab
        if "io_exec" not in audit_info:
            return {}, {}

        panda_table_content_as_markdown = ""
        panda_table_content_as_json = {}
        data_to_parse_for_json = ""
        suffix = ""

        map_col_id_to_id_df = {}

        for steps in audit_info["io_exec"]:
            if not steps.isdigit():
                continue
            type_step = audit_info["io_exec"][steps]["datou_step"]
            param_json = audit_info["io_exec"][steps]["param_json"]
            step_input = audit_info["io_exec"][steps]["input"]
            output = audit_info["io_exec"][steps]["output"]
            if type_step == "load_tab":
                if "assoc" in param_json:
                    print("Will maybe fail, we need to find the right assoc")
                if panda_table_content_as_markdown != "":
                    print("Will maybe fail, since we have two load_tab steps")
                panda_table_content_as_markdown = step_input["result"] if "result" in step_input else ""
            if type_step == "image_to_text":
                suffix = "google_ocr"
                # param_json =
            if type_step == "map_reduce":
                print("We want the second map_reduce, like id_step 7, or maybe any of them via the result field of input?")
                if "text" in output:
                    data_to_parse_for_json = output["text"]
            if type_step == "format":
                if "df_complet_as_markdown" in output:
                    panda_table_content_as_markdown = output["df_complet_as_markdown"]
                if "df_complet_as_json" in output:
                    panda_table_content_as_json = output["df_complet_as_json"]

        from lib.batch.lib_batch import create_pandas_table_from_text
        df = create_pandas_table_from_text(panda_table_content_as_markdown)
        import pandas as pd
        if panda_table_content_as_json != {} and panda_table_content_as_json is not None and panda_table_content_as_json != "":
            df = pd.read_json(panda_table_content_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])
        if df is None:
            print("pd none from : " + str(panda_table_content_as_markdown))
            df = pd.DataFrame(columns=["Null Column"])
        else:
            df["intro_correct"] = "Le ..."
            df["cr_correct"] = "Le ..."
            df["cr_correct_typo"] = ""
            df["intro_correct_typo"] = ""
            df["cr_front_init"] = "init"
            df["intro_front_init"] = "init"
            df["auto_state"] = """<input type="checkbox" value="" name="bordered-checkbox" class="w-4 h-4 text-blue-600 bg-gray-100 border-gray-300 rounded focus:ring-blue-500 dark:focus:ring-blue-600 dark:ring-offset-gray-800 focus:ring-2 dark:bg-gray-700 dark:border-gray-600">Automation impossible"""
            df["auto_state_val"] = "false"
            df["class_vert_rouge_valider"] = "false"

        from lib.lib_util import parse_json_from_prompt_result, \
            complete_date_and_order_json_to_mettre_en_forme, append_id_by_order
        list_json_to_mettre_en_forme = parse_json_from_prompt_result(data_to_parse_for_json)
        list_json_to_mettre_en_forme = append_id_by_order(list_json_to_mettre_en_forme)
        # if order_by_date:
        #     list_json_to_mettre_en_forme = complete_date_and_order_json_to_mettre_en_forme(list_json_to_mettre_en_forme)

        from lib.lib_util import add_parsing_meta_info_to_table
        # VR 29-3-24: in my opinion this has already been done
        # df = add_parsing_meta_info_to_table(df, list_json_to_mettre_en_forme)

        if len(list_action_for_df_to_correct) > 0:
            for action in list_action_for_df_to_correct:
                type_action = action["type_action"] if "type_action" in action else "default"
                if type_action == "df_meta_info_correct":
                    key_action = create_key_action(action)
                    count = (map_modif_df_manual[key_action]["count"] + 1) if key_action in map_modif_df_manual and "count" in map_modif_df_manual[key_action] else 1
                    map_modif_df_manual[key_action] = action
                    map_modif_df_manual[key_action]["count"] = count
                    map_modif_df_manual[key_action]["audit_or_datou"] = "audit"

                    id_line = action["id"]
                    if id_line == 'NaN':
                        continue
                    manual_value = action["manual_value"]
                    col_name = action["col_name"]
                    # refactor: rename front_value and add a
                    manual = bool(action["manual"]) if "manual" in action else False
                    if manual:
                        nb_manual_action_df_for_col_audit += 1
                    # df.loc[int(id_line)][col_name] = manual_value
                    try:
                        # managing incomplete parsing of the document => not clear why it creates this, but there are some links in the audit bug
                        id_line_int = int(str(id_line).replace(".0", ""))
                    except Exception as e:
                        print(" Pb with action id_line : " + str(id_line) + " " + str(e) + " parsing as float then int !")
                        if id_line != 'None':
                            print("ERROR treated as warning, needs to be audited")
                            id_line_int = int(float(id_line))
                        else:
                            continue

                    if map_col_id_to_id_df == {}:
                        for i in range(len(df)):
                            idx_df = df.index[i]
                            idx_from_data = df.loc[idx_df]["id"]
                            if str(idx_from_data).replace(".0", "").isdigit():
                                map_col_id_to_id_df[str(idx_from_data).replace(".0", "")] = idx_df
                            else:
                                print("ERROR should fail, wrong data will be saved ! " + str(idx_from_data))
                                if str(idx_df).isdigit():
                                    map_col_id_to_id_df[str(idx_df)] = idx_df
                    if str(id_line).replace(".0", "") in map_col_id_to_id_df:
                        id_line_df = map_col_id_to_id_df[str(id_line).replace(".0", "")]
                        # df.iloc[id_line_df][col_name] = manual_value
                        df.loc[id_line_df, col_name] = manual_value
                        # df.loc[:, ('one', 'second')]
                    else:
                        print(" Missing id line in df : " + str(id_line) + " in " + str(map_col_id_to_id_df.keys()))
                        # df.loc[id_line_int, col_name] = manual_value
                else:
                    print("type action not supported : " + str(type_action))

        df_to_html = df.to_html(classes="table pdt-table table-striped", escape=False)

        # TODO VR: this clearly belongs in the HTML! contrary to what ChatGPT says
        return {"dataframe_html": df_to_html,
                "dataframe_text": panda_table_content_as_markdown,
                "suffix": suffix,
                "df": df,
                "paragraphs": []}, {"nb_manual_action_df_for_col_audit": nb_manual_action_df_for_col_audit, "map_modif_df_manual": map_modif_df_manual}
    except Exception as e:
        print(str(e))
        return {"dataframe_html": None,
                "dataframe_text": "",
                "suffix": ""}, {"nb_manual_action_df_for_col_audit": nb_manual_action_df_for_col_audit, "map_modif_df_manual": map_modif_df_manual}


# dataframe_html = df.to_html()
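
# A minimal sketch of the audit_info shape that parse_audit_info walks (hypothetical
# values, for illustration only; the real structure comes from the audit json):
#
#   audit_info = {"io_exec": {
#       "0": {"datou_step": "image_to_text", "param_json": {}, "input": {}, "output": {}},
#       "1": {"datou_step": "map_reduce", "param_json": {}, "input": {},
#             "output": {"text": '[{"id": 0, "document_type": "courrier"}]'}},
#       "2": {"datou_step": "format", "param_json": {}, "input": {},
#             "output": {"df_complet_as_json": '{"id": {"0": 0}, "datet": {"0": null}}'}}}}
#   results, counters = parse_audit_info(audit_info)
#   # results  -> {"dataframe_html": ..., "dataframe_text": ..., "suffix": ..., "df": ..., "paragraphs": []}
#   # counters -> {"nb_manual_action_df_for_col_audit": 0, "map_modif_df_manual": {}}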


def load_sub_json(main_json, key_list_slash):
    try:
        if key_list_slash == "":
            return main_json
        separator = "/"
        # if "/" not in key_list_slash and "." in key_list_slash:
        #     separator = "."
        list_keys = key_list_slash.lstrip(separator).split(separator)
        sub_json = main_json
        for k in list_keys:
            if type(sub_json) == list:
                if not k.isdigit():
                    print("Unexpected index for array in load_sub_json : " + str(k))
                    return None
                k_int = int(k)
                if k_int >= len(sub_json):
                    print("Not enough values in array in load_sub_json")
                    return None
                sub_json = sub_json[k_int]
            elif k in sub_json:  # expected type dict
                sub_json = sub_json[k]
                # if type(sub_json) not in (dict, list):
                #     return sub_json
            else:
                # Dict with integer keys, so we need to convert k to int (used in io_exec)
                if k.isdigit():
                    if int(k) in sub_json:
                        sub_json = sub_json[int(k)]
                    else:
                        print(" missing key " + k + " in " + str(sub_json.keys()))
                        return None
        return sub_json
    except Exception as e:
        print("Exception in load_sub_json")
        print(str(e))
        return None
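
# A small usage sketch of load_sub_json with a slash-separated path (hypothetical data):
#
#   doc = {"io_exec": {"3": {"input": {"paragraphs": [[{"text": "a"}], [{"text": "b"}]]}}}}
#   load_sub_json(doc, "io_exec/3/input/paragraphs/1/0/text")   # -> "b"
#   load_sub_json(doc, "io_exec/3/input/missing")               # -> None (after a printed warning)
#
# List segments must be digit strings; dict segments are tried as strings first, then as ints.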


def set_sub_json(main_json, key_list_slash, value):
    separator = "/"
    # if "/" not in key_list_slash and "." in key_list_slash:
    #     separator = "."
    list_keys = key_list_slash.lstrip(separator).split(separator)
    if key_list_slash.lstrip(separator) == "":
        return value
    sub_json = main_json
    for k in list_keys[:-1]:
        if type(sub_json) == list:
            if not k.isdigit():
                print("Unexpected index for array in set_sub_json : " + str(k))
                return None
            k_int = int(k)
            if k_int >= len(sub_json):
                print("Not enough values in array in set_sub_json")
                return None
            sub_json = sub_json[k_int]
        elif k in sub_json:
            sub_json = sub_json[k]
        else:
            # missing intermediate keys are only reported, not created
            print(" missing key " + k + " in " + str(sub_json.keys()))
    sub_json[list_keys[-1]] = value
    return main_json
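
# Usage sketch for set_sub_json (hypothetical data): the path addresses the parent
# container, and the last segment is the key that gets written:
#
#   doc = {"io_exec": {"9": {"output": {"df_complet_as_json": "{}"}}}}
#   set_sub_json(doc, "io_exec/9/output/df_complet_as_json", '{"id": {"0": 0}}')
#   # doc["io_exec"]["9"]["output"]["df_complet_as_json"] is now '{"id": {"0": 0}}'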


def create_key_action(action):
    if "type_action" not in action or action["type_action"] != "df_meta_info_correct" or "col_name" not in action or "id" not in action:
        return None
    return str(action["id"]) + "_" + str(action["col_name"])


def is_numericf(input):
    if input == "NaN":
        return False
    try:
        float(input)
        return True
    except ValueError:
        return False
    except Exception as e:
        print("Unexpected error treated as warning")
        print(str(e))
        return False
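
# Quick behaviour sketch for is_numericf (assumed semantics, from the code above):
#   is_numericf("3")    # -> True
#   is_numericf("3.0")  # -> True
#   is_numericf("NaN")  # -> False (explicitly rejected, even though float("NaN") parses)
#   is_numericf("abc")  # -> False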


def count_time_lab_by_user(all_result_manual_correct, delta_min_between_save_minutes=10):
    map_user_id_time_modif = {}
    try:
        for result in all_result_manual_correct:
            user_id = result["user_id"] if "user_id" in result else 0
            if user_id not in map_user_id_time_modif:
                map_user_id_time_modif[user_id] = []
            if "created_at" in result and type(result["created_at"]) == datetime.datetime:
                map_user_id_time_modif[user_id].append(result["created_at"])
    except Exception as e:
        print(str(e))

    map_result_time_by_user = {}

    for user_id in map_user_id_time_modif:
        map_user_id_time_modif[user_id] = sorted(list(set(map_user_id_time_modif[user_id])))
        count_minutes = 0
        list_intervals = []
        previous_time = None
        for time_modif in map_user_id_time_modif[user_id]:
            if previous_time is None:
                previous_time = time_modif
                list_intervals.append([time_modif, time_modif])
            else:
                delta = time_modif - previous_time
                delta_in_minutes = delta.total_seconds() / 60.0
                if delta_in_minutes <= delta_min_between_save_minutes:
                    # same interval
                    list_intervals[-1][1] = time_modif
                else:
                    list_intervals.append([time_modif, time_modif])
            previous_time = time_modif

        for interval in list_intervals:
            delta = interval[1] - interval[0]
            delta_in_minutes = delta.total_seconds() / 60.0
            count_minutes += delta_in_minutes + delta_min_between_save_minutes
        map_result_time_by_user[user_id] = {"intervals": list_intervals,
                                            "total_minutes": count_minutes}

    return map_result_time_by_user
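
# Worked example for count_time_lab_by_user (hypothetical timestamps, default
# delta_min_between_save_minutes=10): saves at 10:00, 10:05, 10:12 and 11:00 give
# two intervals, [10:00-10:12] and [11:00-11:00], because 11:00 is more than 10
# minutes after 10:12. Total minutes = (12 + 10) + (0 + 10) = 32, i.e. each
# interval's span plus one "tail" of delta_min_between_save_minutes.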


def list_action_by_user(all_result_manual_correct, df_complete):
    map_user_id_list_pages = {}
    map_user_id_list_pages_for_split = {}
    map_user_id_time_modif = {}
    try:
        for result in all_result_manual_correct:
            if "manual_input_info" not in result or "list_actions" not in result["manual_input_info"]:
                print("Warning missing actions in result : " + str(result))
                continue
            user_id = result["user_id"] if "user_id" in result else 0
            if user_id not in map_user_id_list_pages:
                map_user_id_list_pages[user_id] = []
                map_user_id_time_modif[user_id] = []
                map_user_id_list_pages_for_split[user_id] = []
            if "created_at" in result and type(result["created_at"]) == datetime.datetime:
                map_user_id_time_modif[user_id].append(result["created_at"])
            for action in result["manual_input_info"]["list_actions"]:
                if "id_page" in action:
                    if not str(action["id_page"]).isdigit():
                        print("How come this id_page isn't an int : " + str(action["id_page"]))
                        continue
                    id_page = int(action["id_page"])
                    if id_page not in map_user_id_list_pages[user_id]:
                        map_user_id_list_pages[user_id].append(id_page)
                    if "col_name" in action and action["col_name"] == "Liste des pages":
                        if id_page not in map_user_id_list_pages_for_split[user_id]:
                            map_user_id_list_pages_for_split[user_id].append(id_page)
                elif "id" not in action or not is_numericf(str(action["id"])):
                    print("Warning missing id in action : " + str(action))
                    continue
                else:
                    id = int(float(action["id"]))
                    list_pages = df_complete.loc[df_complete["id"] == id, "Liste des pages"].values[0] if "Liste des pages" in df_complete.columns else ""
                    if list_pages == "" or not list_pages.replace(",", "").isdigit():
                        print("Warning missing list_pages in df_complete for id : " + str(id))
                        continue
                    list_pages_as_list_int = [int(x) for x in list_pages.split(",") if x.strip().isdigit()]
                    map_user_id_list_pages[user_id].extend(list_pages_as_list_int)
                    if "col_name" in action and action["col_name"] == "Liste des pages":
                        map_user_id_list_pages_for_split[user_id].extend(list_pages_as_list_int)
        for user_id in map_user_id_list_pages:
            map_user_id_list_pages[user_id] = sorted(list(set(map_user_id_list_pages[user_id])))
        for user_id in map_user_id_list_pages_for_split:
            map_user_id_list_pages_for_split[user_id] = sorted(list(set(map_user_id_list_pages_for_split[user_id])))
    except Exception as e:
        print(str(e))

    return map_user_id_list_pages, map_user_id_time_modif, map_user_id_list_pages_for_split


def modify_audit_info_from_manual_correction(audit_info, all_result_manual_correct,
                                             manual_action_to_audit_data={},
                                             project_id=None,
                                             lib_user_data_internal=None):
    if manual_action_to_audit_data == {}:
        if lib_user_data_internal is None:
            print(" We need access to the database to get the conf_project and the saxia param")
        else:
            print(" We will load the saxia configuration from the project_id")
            if project_id is None:
                print("Error project_id is None and no manual_action_to_audit_data")

                # We can still try to get the project_id from the hash_id_token, but won't do it right now
                hash_id_treatment = audit_info["config"]["complete_param_json"]["hash_id_treatment"] if "config" in audit_info and "complete_param_json" in audit_info["config"] and "hash_id_treatment" in audit_info["config"]["complete_param_json"] else None
                all_result = lib_user_data_internal.load_data_audit(hash_id_treatment=hash_id_treatment)
                # print(" all_result : " + str(all_result.keys()))
                project_id = all_result["project_id"] if "project_id" in all_result else None

            if project_id is None:
                print(" We can't do anything")
            else:
                conf_project = lib_user_data_internal.load_conf_project(project_id)
                saxia_conf = conf_project["saxia"] if "saxia" in conf_project else {}
                assoc_conf = saxia_conf["assoc"] if "assoc" in saxia_conf else {}
                manual_action_to_audit_data = assoc_conf["manual_action_to_audit_data"] if "manual_action_to_audit_data" in assoc_conf else {}

    nb_modif_class_manual = 0
    nb_manual_action_df = 0
    map_modif_df_manual = {}  # would also allow a projection
    paragraphs = []
    df_auto = None
    # initialized before the try so the bookkeeping below still works if an exception fires early
    list_action_for_df_to_correct = []
    map_count_modif_per_doc = {}
    map_modif_type_document = {}
    audit_info_corrected = audit_info
    try:
        # key_as_list_slash = "io_exec/0/output/paragraphs"
        key_as_list_slash = "io_exec/3/input/paragraphs"
        key_as_list_slash = manual_action_to_audit_data["class_paragraph"]["audit_data"] if "class_paragraph" in manual_action_to_audit_data and "audit_data" in manual_action_to_audit_data["class_paragraph"] else key_as_list_slash

        map_col_id_to_id_df = {}

        paragraphs = load_sub_json(audit_info_corrected, key_as_list_slash)
        # order modifications by increasing date
        all_result_manual_correct_ordered = sorted(all_result_manual_correct, key=lambda x: x["created_at"])
        for result in all_result_manual_correct_ordered:
            created_at_as_string = result["created_at"].strftime("%y%m%d_%H:%M:%S")  # bug fix: minutes are %M, not %m
            print(" Modification from created_at_as_string :" + created_at_as_string + " result id : " + str(result["id"]))
            manual_input_info = result["manual_input_info"] if "manual_input_info" in result else {}
            list_actions = manual_input_info["list_actions"] if "list_actions" in manual_input_info else []
            for action in list_actions:
                type_action = action["type_action"] if "type_action" in action else "default"
                if type_action == "class_paragraph":
                    nb_modif_class_manual += 1
                    # class_action stores an id_page instead of an id_file => significant impact to correct it, so we will probably keep it like this 8/4/24
                    id_page = action["id_page"]
                    if id_page not in map_count_modif_per_doc:
                        map_count_modif_per_doc[id_page] = 0
                    map_count_modif_per_doc[id_page] += 1
                    id_paragraph = action["id_paragraph"] if "id_paragraph" in action else None
                    manual_class = action["manual_class"] + "_class" if "manual_class" in action else None
                    if id_paragraph is None or manual_class is None:
                        print("ERROR most likely due to crops modified again by the switch 2025-05 ")
                        continue
                    # bug fix: the original discarded the result of str.replace
                    manual_class = manual_class.replace("_class_class", "_class")
                    id_page_int = int(id_page)
                    id_paragraph_int = int(id_paragraph)
                    if id_page_int < len(paragraphs):
                        if id_paragraph_int < len(paragraphs[id_page_int]):
                            paragraphs[id_page_int][id_paragraph_int]["class"] = manual_class
                        else:
                            print("ERROR id_paragraph_int greater than len(paragraphs[id_page_int]) " + str(id_paragraph_int) + " nb : " + str(len(paragraphs[id_page_int])) + " for " + str(id_page_int))
                            continue
                    else:
                        print("ERROR id_page_int greater than len(paragraphs) " + str(id_page_int) + " nb : " + str(len(paragraphs)))
                        continue
                    # audit_info_corrected["io_exec"]["3"]["input"]["paragraphs"] = paragraphs

                elif type_action == "df_meta_info_correct":
                    key_action = create_key_action(action)
                    count = (map_modif_df_manual[key_action]["count"] + 1) if key_action in map_modif_df_manual and "count" in map_modif_df_manual[key_action] else 1
                    map_modif_df_manual[key_action] = action
                    map_modif_df_manual[key_action]["count"] = count
                    map_modif_df_manual[key_action]["audit_or_datou"] = "datou"

                    # list_action_for_df_to_correct.append(action)
                    if True:
                        key_for_df_as_md = "io_exec/6/output"  # df_complet_as_markdown
                        key_for_df_as_md = "io_exec/9/output/df_complet_as_markdown"  # df_complet_as_markdown # it's 9 in prod, grrr => that's really not right, grrr
                        key_for_df_as_md = manual_action_to_audit_data["df_meta_info_correct"]["audit_data"] if "df_meta_info_correct" in manual_action_to_audit_data and "audit_data" in \
                            manual_action_to_audit_data["df_meta_info_correct"] else key_for_df_as_md
                        df_as_md = load_sub_json(audit_info_corrected, key_for_df_as_md)
                        key_for_df_as_json = "io_exec/9/output/df_complet_as_json"
                        key_for_df_as_json = manual_action_to_audit_data["df_meta_info_correct_json"]["audit_data"] if "df_meta_info_correct_json" in manual_action_to_audit_data and "audit_data" in \
                            manual_action_to_audit_data["df_meta_info_correct_json"] else key_for_df_as_json
                        df_as_json = load_sub_json(audit_info_corrected, key_for_df_as_json)

                        if df_as_json is not None:
                            import pandas as pd
                            df = pd.read_json(df_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])
                            # from copy import deepcopy
                            if df_auto is None:
                                df_auto = df.copy()  # deep=True
                                # df_auto = deepcopy(df_as_json)
                        elif df_as_md is not None:
                            print("ERROR SINCE 06-2024 : WE NEVER GO THROUGH THIS !?! ")
                            # from lib.batch.lib_batch import create_pandas_table_from_text
                            # df = create_pandas_table_from_text(df_as_md, verbose=False)
                            print("ON 08-2024 : WE COMMENT THE TWO PREVIOUS LINES AND RETURN => LOAD AUDIT WILL CRASH ")
                            return None
                        else:
                            print("ERROR FROM THE START ")
                            from lib.batch.lib_batch import init_df_synchronize
                            df = init_df_synchronize()  # pandas.core.dtypes.generic.create_pandas_abc_type("DataFrame", [])
                        if action["col_name"] in df.columns:
                            id_line = action["id"]
                            # is_numericf is defined above in this module; no self-import needed
                            if not is_numericf(id_line):
                                # if id_line == "" or (id_line != "0" and not str(id_line).rstrip(".0").isdigit()):
                                print(" action not treated since input data is wrong : " + str(id_line) + " " + str(action))
                                continue
                            manual_value = action["manual_value"]
                            col_name = action["col_name"]
                            if col_name == "datet":
                                # print("Will maybe fail : " + str(action))
                                if type(df.loc[0, "datet"]) != pandas._libs.tslibs.timestamps.Timestamp and type(df.loc[0, "datet"]) != datetime.datetime and type(df.loc[0, "datet"]) != pandas._libs.tslibs.nattype.NaTType:
                                    print("Will fail now : " + str(action) + " result id : " + str(result["id"]))
                                    print(str(type(df.loc[0, "datet"])))
                            if col_name == "date_entree_hospitalisationt" or col_name == "date_sortie_hospitalisationt" or col_name == "date_fin_arret_travailt" or col_name == "date_debut_arret_travailt" or col_name == "datet":
                                import dateparser
                                manual_value_parsed = dateparser.parse(manual_value)
                                print(manual_value_parsed)
                                if manual_value_parsed is None:
                                    continue
                            if col_name == "document_type":
                                map_modif_type_document[id_line] = manual_value
                            if map_col_id_to_id_df == {}:
                                for i in range(len(df)):
                                    idx_df = df.index[i]
                                    idx_from_data = df.loc[idx_df]["id"]
                                    if str(idx_from_data).replace(".0", "").isdigit():
                                        map_col_id_to_id_df[str(idx_from_data).replace(".0", "")] = idx_df
                                    else:
                                        print("ERROR should fail, wrong data will be saved ! : " + str(idx_from_data))
                                        # if str(idx_df).isdigit():
                                        #     map_col_id_to_id_df[str(idx_df)] = idx_df
                            if str(id_line).replace(".0", "") in map_col_id_to_id_df:
                                id_line_df = map_col_id_to_id_df[str(id_line).replace(".0", "")]
                                # df.iloc[id_line_df][col_name] = manual_value
                                if col_name == "datet" and ("20" not in str(manual_value) and "19" not in str(manual_value)):
                                    print("Protect wrong date : " + str(manual_value))
                                    manual_value = None
                                df.loc[id_line_df, col_name] = manual_value
                            else:
                                print(" Missing id line in df : " + str(id_line) + " in " + str(map_col_id_to_id_df.keys()))
                                # df.iloc[int(float(id_line))][col_name] = manual_value
                                print("TO TEST")
                            df_as_md = df.to_markdown()
                            df_as_json = df.to_json()

                            # refactor: rename front_value and add a
                            manual = bool(action["manual"]) if "manual" in action else False
                            if manual:
                                nb_manual_action_df += 1

                            set_sub_json(audit_info_corrected, key_for_df_as_md, df_as_md)
                            set_sub_json(audit_info_corrected, key_for_df_as_json, df_as_json)
                            # audit_info_corrected["io_exec"]["9"]["output"]["df_complet_as_markdown"] = df.to_markdown()
                        else:
                            # print("col_name not in df : " + str(action["col_name"]) + " in " + str(df.columns))
                            list_action_for_df_to_correct.append(action)
                elif type_action == "crops":
                    id_page = action["id_page"] if "id_page" in action else None
                    if id_page is None:
                        print("ERROR treated as warning id_page not in action")
                        continue
                    # paragraphs[id_page]["crops"] = action["crops"] if "crops" in action else None

                    if "modify" in action:
                        for modification in action["modify"]:
                            if "id" not in modification:
                                print("ERROR id of paragraph not in modification")
                                continue
                            # bug fix: the original indexed with the undefined name `id`; use the modification's own id
                            id_par = int(modification["id"])
                            if "x" in modification:
                                x = modification["x"]
                                paragraphs[id_page][id_par]["x"] = x
                            if "y" in modification:
                                y = modification["y"]
                                paragraphs[id_page][id_par]["y"] = y
                            if "w" in modification:
                                w = modification["w"]
                                paragraphs[id_page][id_par]["w"] = w
                            if "h" in modification:
                                h = modification["h"]
                                paragraphs[id_page][id_par]["h"] = h
                            if "text" in modification:
                                text = modification["text"]
                                paragraphs[id_page][id_par]["text"] = text

                    if "delete" in action:
                        for id in action["delete"]:
                            if not str(id).isdigit():
                                print("Error due to a deleted crop that has no id")
                                continue
                            if int(id) < len(paragraphs[id_page]):
                                paragraphs[id_page][int(id)] = {}
                                # del paragraphs[id_page][int(id)]
                            else:
                                print("ERROR id of paragraph not in modification")
                    if "add" in action:
                        for one_new_par in action["add"]:
                            new_id = len(paragraphs[id_page])
                            one_new_par["id"] = new_id
                            paragraphs[id_page].append(one_new_par)
                            # if id not in paragraphs[id_page]:
                            #     paragraphs[id_page][id] = {}
                            # else:
                            #     print("ERROR id of paragraph not in modification")
                else:
                    print("type action not supported : " + str(type_action))
    except Exception as e:
        print(str(e))
        print("Error treated as warning (to be audited) in modify_audit_info_from_manual_correction")
        audit_info_corrected = audit_info
        # list_action_for_df_to_correct = []

    results, audit_info_from_datou = parse_audit_info(audit_info_corrected, list_action_for_df_to_correct, map_modif_df_manual)
    nb_manual_action_df_for_col_audit = audit_info_from_datou["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_from_datou else -4
    map_modif_df_manual = audit_info_from_datou["map_modif_df_manual"] if "map_modif_df_manual" in audit_info_from_datou else {}
    audit_info_count = {"nb_modif_class_manual": nb_modif_class_manual, "nb_manual_action_df": nb_manual_action_df,
                        "nb_manual_action_df_for_col_audit": nb_manual_action_df_for_col_audit,
                        "map_modif_df_manual": map_modif_df_manual,
                        "map_count_modif_per_doc": map_count_modif_per_doc,
                        "map_modif_type_document": map_modif_type_document}
    results["paragraphs"] = paragraphs
    return df_auto, results, audit_info_count
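
# Sketch of the manual-correction action shapes this function consumes (field names
# taken from the code above; the values are hypothetical):
#
#   {"type_action": "class_paragraph", "id_page": "2", "id_paragraph": "0",
#    "manual_class": "titre"}                        # reclassify one paragraph
#   {"type_action": "df_meta_info_correct", "id": "3", "col_name": "datet",
#    "manual_value": "2024-03-12", "manual": True}   # overwrite one dataframe cell
#   {"type_action": "crops", "id_page": 1,
#    "modify": [{"id": 0, "x": 10, "y": 20, "w": 100, "h": 40, "text": "..."}],
#    "delete": [2], "add": [{"x": 0, "y": 0, "w": 50, "h": 10, "text": "new"}]}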


def load_audit_info_and_apply_manual_correction(hash_id_treatment_auto=None,
                                                hash_id_treatment_manual=None,
                                                lpgss=None,
                                                limit=None,
                                                project_id=None,
                                                ids_manual=None,
                                                manual_action_to_audit_data={}):
    import json
    df_auto = None
    all_result = lpgss.load_data_audit(hash_id_treatment=hash_id_treatment_manual, col_csv="*")
    # print(" all result : " + str(all_result))
    if all_result is None:
        print(" all_result seems None from hash_id_treatment : " + str(hash_id_treatment_manual))
        all_result = {}
    print(" all_result : " + str(all_result.keys()))
    info_date = all_result["info_date"] if "info_date" in all_result else {}
    info_date["test_var_info_date"] = "We want to find in which hit this data is recorded, either hit_main or cons_hit"
    audit_json_file_content = all_result["audit_info"] if "audit_info" in all_result else {}
    id_file = all_result["id_file"] if "id_file" in all_result else None
    try:
        audit_json_file_content_as_json = json.loads(audit_json_file_content)
    except Exception as e:
        audit_json_file_content_as_json = audit_json_file_content
        print("This is now on the critical path; I think an ajax call would fix the problem, but I don't understand it")
        print(str(e))

    try:
        # parse_audit_info and modify_audit_info_from_manual_correction are defined above in this module
        if limit is not None and int(limit) == 0:
            all_results = []
        else:
            all_results = lpgss.load_data_manual(ids_manual=ids_manual,
                                                 hash_id_treatment=hash_id_treatment_manual,
                                                 limit=limit)
        if len(all_results) == 0:
            results, audit_info_count = parse_audit_info(audit_json_file_content)
            nb_manual_action_df_for_col_audit = audit_info_count["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_count else -3
            # audit_info_count = {}
            df_auto_as_json = None
            print("There are no corrections, to be tested")
            df_auto = results["df"].copy()  # deep=True
        else:
            df_auto, results, audit_info_count = modify_audit_info_from_manual_correction(
                audit_json_file_content_as_json,
                all_results,
                manual_action_to_audit_data=manual_action_to_audit_data,
                project_id=project_id)
    except Exception as e:
        results = {"error": str(e)}
        audit_info_count = {}
        print(str(e))
        print("LOG_TO_PARSE : Error parsing audit result json manual : " + str(hash_id_treatment_manual) + " auto " + str(hash_id_treatment_auto))

    hash_id_treatment_rerun = all_result["info_date"]["consolidate_hash_id_treatment"] if "info_date" in all_result and "consolidate_hash_id_treatment" in all_result["info_date"] else None
    df_cons = all_result["info_consolidate"]["df_cons"] if "info_consolidate" in all_result and "df_cons" in all_result["info_consolidate"] else None

    return df_auto, df_cons, hash_id_treatment_rerun, results, audit_info_count, id_file, audit_json_file_content_as_json, all_results, info_date


def get_list_backup(histo_folder, mtd_id=None):
    # In the histo_folder directory we want to parse json files with names like
    # datou_anon_42_0213_22.json, where 42 is the datou_id and 0213_22 encodes the
    # date (here 22:00 on 02/13); the year should be optional, 2024 by default.
    # We also want to be able to filter by mtd_id, but that is less important.
    # We also want to be able to sort by date for display in the front end.
    import os
    import json
    map_datou_id_date_backup = {}
    map_proj_id_date_backup = {}
    if histo_folder is None:
        return map_datou_id_date_backup, map_proj_id_date_backup
    for filename in os.listdir(histo_folder):
        if filename.endswith(".json"):
            print("Open " + str(filename))
            with open(os.path.join(histo_folder, filename)) as f:
                try:
                    data = json.load(f)
                except Exception as e:
                    print(" Error reading " + filename + " : " + str(e))
                    continue

            if "project_id" in data:
                map_proj_id_date_backup[data["project_id"]] = data
                continue

            if type(data) == list:
                if len(data) == 1:
                    data = data[0]
                else:
                    print("Warning: size of data in " + filename + " : " + str(len(data)) + ", avoiding this file ")
                    continue  # bug fix: the message says the file is skipped, so actually skip it
            if "id" not in data:  # to manage the case of the intricate export
                if "datou" not in data or len(data["datou"]) == 0:
                    print("Unexpected data in " + filename + " : " + str(data) + ", avoiding this file ")
                    continue

                data = data["datou"][0]

            if mtd_id is not None:
                if "id" in data and int(data["id"]) != int(mtd_id):
                    continue
            if "id" in data and data["id"] not in map_datou_id_date_backup:
                map_datou_id_date_backup[data["id"]] = {}

            print(" filename : " + str(filename))

            import re
            # example filename for parsing the date: datou_anon_42_0213_22.json (and check the datou id)
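            # Hypothetical examples of what the regex below extracts (assumed naming
            # convention, for illustration):
            #   datou_anon_42_0213_22.json            -> id 42, no suffix, month 02, day 13, hour 22, default year
            #   datou_anon_42_backup_20240213_22.json -> id 42, suffix "backup_", year 2024, month 02, day 13, hour 22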

            suffix = ""
            try:
                date_parsed = datetime.datetime.now()
                m = re.search(r"_(\d+)_([\s\w\W-]*_)?(\d{4})?(\d{2})(\d{2})_(\d{2})", filename)

                if m:
                    # note: m.groups() always has 6 entries for this pattern, so only the 6-group branch below is live
                    if len(m.groups()) == 5 and m.group(2) is not None and m.group(2) != "":
                        print(" Parsed 5 groups ! " + str(m.groups()))  # to be tested with the thing at line 92
                        date_parsed = datetime.datetime(int(m.group(2)), int(m.group(3)), int(m.group(4)),
                                                        int(m.group(5)))
                    elif len(m.groups()) == 5:
                        print(" Parsed 4 real groups ! " + str(m.groups()))
                        default_year = 2024
                        month = int(m.group(3))
                        day = int(m.group(4))
                        hour = int(m.group(5))

                        date_parsed = datetime.datetime(default_year, month, day, hour)
                    elif len(m.groups()) == 6:
                        print(" Parsed 6 groups ! " + str(m.groups()))
                        # bug fix: the original condition (m.group(3) != "") made the default year always win
                        default_year = 2024 if (m.group(3) is None or m.group(3) == "") else int(m.group(3))
                        month = int(m.group(4))
                        day = int(m.group(5))
                        hour = int(m.group(6))
                        suffix = m.group(2)
                        if suffix is None:
                            suffix = ""
                        mtd_id_parsed = int(m.group(1))

                        date_parsed = datetime.datetime(default_year, month, day, hour)
                    elif len(m.groups()) == 4:
                        print(" Fallback with 4 groups not handled, unclear what it would mean: " + str(m.groups()) + " filename : " + filename)
                        continue
                    else:
                        print("Could not parse date from filename : " + filename)
                        continue
                else:
                    print("Could not parse date from filename : " + filename)
            except Exception as e:
                print(str(e))
                print("parsing the backup name didn't work, we may have a wrong convention between export and voila, continue : " + filename)
                continue

            from lib.lib_util import humanize_modified_time
            date_parsed_h = humanize_modified_time(date_parsed.replace(tzinfo=None))
            data["saved_at"] = date_parsed
            data["saved_at_h"] = date_parsed_h + " : " + str(suffix).rstrip("_")  # filename # date_parsed.strftime("%y%m%d_%H")
            # del data["data_str"]
            map_datou_id_date_backup[data["id"]][date_parsed] = data

    for mtd_id in map_datou_id_date_backup:
        map_datou_id_date_backup[mtd_id] = dict(sorted(map_datou_id_date_backup[mtd_id].items(), reverse=True))

    map_datou_id_date_h_backup = {}
    for mtr_id in map_datou_id_date_backup:
        map_datou_id_date_h_backup[mtr_id] = {}
        for date in map_datou_id_date_backup[mtr_id]:
            data = map_datou_id_date_backup[mtr_id][date]
            map_datou_id_date_h_backup[mtr_id][data["saved_at_h"]] = data

    return map_datou_id_date_h_backup, map_proj_id_date_backup


# --job=saxia.stat_quali --limit=200 -v --project_id=134
# --job=saxia.stat_quali --limit=200 -v --project_id=122
def study_qualite_2024(lpgss=None,
                       type_doc="document_type",
                       verbose=False,
                       condition_query="TODO",
                       limit=100,
                       project_id=None):

    if lpgss is None:
        print("Missing DB connector")
        return None

    from lib.lib_util import count_and_display_elapsed_time
    import time
    begin_time = time.time()

    try:
        auto_res = lpgss.load_auto_val(type_doc, verbose, limit, project_id=project_id)
    except Exception as e:
        print("Error loading auto_val : " + str(e))
        import pandas as pd
        auto_res = pd.DataFrame(columns=["hit", "id_row", "auto_val", "id_file", "id", "hit_id_row"])

    auto_res.to_csv("auto_res.csv", sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_auto_val " + type_doc)

    manual_res = lpgss.load_manual_correct_val(type_doc, verbose, limit, project_id=project_id)
    manual_res.to_csv("manual_res.csv", sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_manual_correct_val " + type_doc)

    all_df_cons = lpgss.load_df_cons(verbose, limit, project_id=project_id)
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_df_cons ")
    big_df = None
    import pandas as pd
    for data in all_df_cons:
        one_df = pd.DataFrame(data["df_cons"])
        one_df["id_doc"] = 0
        one_df["id_mde"] = 0
        id_file = data["id_file"]
        id_mde = data["id"] if "id" in data else None
        one_df["id_file"] = id_file
        for idx, row in one_df.iterrows():
            if row["id"] == "NaN" or row["id"] == "" or not str(row["id"]).isdigit():
                print("Warning : id is NaN or empty in one_df, skipping this row")
                continue
            one_df.at[idx, "id"] = 1000 * id_mde + int(row["id"])
            one_df.at[idx, "id_mde"] = id_mde
            one_df.at[idx, "id_doc"] = int(row["id"])
            str_id = id_file + "_l_" + str(row["id"])
            # one_df.at[idx, "id"] = str_id

        if big_df is None:
            big_df = one_df
        else:
            big_df = pd.concat([big_df, one_df], ignore_index=True)

    big_df.to_csv('big_df.csv', sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "build big_df ")

    # build key
    key = "hit_id_row"
    for idx in auto_res.index:
        data = auto_res.loc[idx]
        auto_res.loc[idx, key] = str(data["hit"]) + "_" + str(data["id_row"])
    for idx in manual_res.index:
        data = manual_res.loc[idx]
        manual_res.loc[idx, key] = str(data["hit"]) + "_" + str(data["id_row"])
    # set manual_val in auto_res

    map_count_key_value = {}
    total = 0

    for idx in auto_res.index:
        data = auto_res.loc[idx]
        key_val = data["hit_id_row"]

        total += 1
        try:
            # if True:  # key in manual_res.index:
            idxs = manual_res.loc[manual_res['hit_id_row'] == key_val].index
            if len(idxs) == 1:
                auto_res.loc[idx, "manual_val"] = manual_res.loc[idxs[0]]["manual_val"]
                val = manual_res.loc[idxs[0]]["manual_val"]
            else:
                if len(idxs) > 1:
                    print(" Not treated: unexpected multiple values")
                auto_res.loc[idx, "manual_val"] = data["auto_val"]
                val = data["auto_val"]

            if val not in map_count_key_value:
                map_count_key_value[val] = 0
            map_count_key_value[val] += 1
        except Exception as e:
            print(str(e))
            auto_res.loc[idx, "manual_val"] = data["auto_val"]
            # print("missing key " + key + " in auto_res")
    # set manual_val to auto_val when missing

    map_count_key_value_ordered = dict(sorted(map_count_key_value.items(), key=lambda item: item[1], reverse=True))

    print(" total : " + str(total))
    print(" map_count_key_value_ordered : " + str(map_count_key_value_ordered))
    count_quantile_90 = 0
    count_nb_to_keep = 0
    for key in map_count_key_value_ordered:
        count_quantile_90 += map_count_key_value_ordered[key]
        count_nb_to_keep += 1
        if count_quantile_90 > 0.9 * total:
            break

    print(" Kept count_nb_to_keep : " + str(count_nb_to_keep) + " over " + str(len(map_count_key_value_ordered)))

    if type_doc == "document_type":
        list_key_value_doc_type = ["cr_operation", "cr_exam", "cr_hospit", "cr_urgence", "courrier", "facture_utile", "facture", "ordonnance", "certif_blessure", "certif_at", "attestation", "certif_medical", "certif_hospitalisation", "autre", "ordonnance_medicament_exception", "cr_pompier", "facture_inutile"]
    else:
        list_key_value_doc_type = list(map_count_key_value_ordered.keys())
        # [:count_nb_to_keep]
    print(" list_key_value_doc_type : " + str(list_key_value_doc_type))
    # type_doc
    # create a dataframe with these lists as columns and rows
    import pandas as pd
    # df = pd.DataFrame(columns=list_key_value_doc_type)

    # Now compute the confusion matrix
    associated_labels = {}
    predicted_labels = {}
    list_associated_labels = list_key_value_doc_type + ["Unknown Label"]
    list_predicted_labels = list_key_value_doc_type + ["Unknown Label"]
    list_data_on_sen_fout = []
    map_list_error = {}
    idx_voila = 0
    for idx in auto_res.index:
        data = auto_res.loc[idx]
        manual_val = data["manual_val"]
        auto_val = data["auto_val"]
        idx_voila += 1  # data["hit_id_row"]
        associated_labels[idx_voila] = manual_val
        predicted_labels[idx_voila] = auto_val
        if auto_val != manual_val:
            key_diff = str(auto_val) + "P" + str(manual_val)
            if key_diff not in map_list_error:
                map_list_error[key_diff] = []

            map_list_error[key_diff].append((data["hit"], data["id_row"]))

    if type_doc == "document_type":

        from pyfvs.lib.advanced.lib_confusion_matrix import compute_confusion_matrix
        mat_conf = compute_confusion_matrix(associated_labels, predicted_labels,
                                            list_associated_labels, list_predicted_labels, list_data_on_sen_fout=[])
        print("mat_conf : " + str(mat_conf))
        print(mat_conf)
        with open("mat_conf_" + type_doc + ".html", "w") as f:
            f.write(str(mat_conf.to_html()))
        print(" mat_conf.txt written")

        print("<br><br> Error type_doc : " + type_doc + "<br><br>")
        for d in list_associated_labels:
            for p in list_predicted_labels:
                if mat_conf.at[p, d] > 0:
                    key_list_diff = str(p) + "P" + str(d)
                    if key_list_diff in map_list_error:
                        print("<br>mat_conf.at[" + d + "," + p + "] : " + str(mat_conf.at[p, d]))
                        for data in map_list_error[key_list_diff]:
                            print("<br><a href='https://safia.app/manax?hash_id_treatment=" + str(data[0]) + "&id_line=" + str(data[1]) + "'> LINE " + str(data[1]) + " " + str(p) + " to " + str(d) + " </a>")
                        # print("mat_conf.at[" + d + "," + p + "] : " + str(mat_conf.at[p, d]))
                        # for data in auto_res:
                        #     if data["auto_val"] == p and data["manual_val"] == d:
                        #         print("data : " + str(data))
    else:
        print("<br>\n".join(list(map_list_error.keys())))

    print("TO CHECK WIP 15-8-24")


# --job=saxia.stat_quali --limit=200 -v --project_id=94 --in_file=condition_intro_doc,only_correct_prediag,condition_only_one_page
def study_qualite(lpgss=None,
                  type_doc="document_type",
                  verbose=False,
                  condition_query="TODO",
                  limit=100,
                  project_id=None,
                  condition_intro_doc=True,
                  only_correct_prediag=True,
                  condition_only_one_page=True,
                  list_to_study="document_type,medecin_nom,medecin_specialite,datet",
                  with_out_folder=False,
                  prepare_data_set=False):

    if lpgss is None:
        print("Missing DB connector")
        return None

    list_dataset = []

    map_list_input_by_document = {}
    list_complete = []
    if condition_intro_doc == True:
        if project_id is None:
            load_conf_from_project_id = 91
        else:
            load_conf_from_project_id = project_id

        from lib.lib_safia_system import LibSafiaSystem
        lss = LibSafiaSystem(lib_user_data_internal=lpgss)
        raw_configuration = lss.load_conf_project(load_conf_from_project_id)
        from lib.util.lib_formal_conf import formal_conf_prepare
        configuration = formal_conf_prepare(raw_configuration, lss)
        intro_format_intro = load_sub_json(configuration, "saxia/format/info_format_intro/format/intro")

        from lib.lib_util import compute_list_input_to_format_per_document
        map_list_input_by_document = compute_list_input_to_format_per_document(intro_format_intro)
        # We also need the complete list of document types, so we concatenate the lists
        # that are the values of map_list_input_by_document
        list_complete = []
        for key in map_list_input_by_document:
            list_complete += map_list_input_by_document[key]
        list_complete = list(set(list_complete))

    from lib.lib_util import count_and_display_elapsed_time
    import time
    begin_time = time.time()

    all_df_cons = lpgss.load_df_cons(limit=limit, project_id=project_id, verbose=verbose, with_out_folder=False)  # with_out_folder)
    list_mhit = []
    map_mhit_outfolder = {}
    for data in all_df_cons:
        if "mhit" in data and data["mhit"] is not None:
            list_mhit.append(data["mhit"])
    if with_out_folder:
        list_mhit_out_folder = lpgss.load_output_folder_from_mhit(list_mhit, verbose=verbose)
        for r in list_mhit_out_folder:
            if "mhit" in r and "out_folder" in r:
                if r["out_folder"] is not None and r["mhit"] is not None:
                    map_mhit_outfolder[r["mhit"]] = r["out_folder"] + "/" + r["mhit"]
                else:
                    print("Warning : missing out_folder for mhit " + str(r))
    begin_time, message = count_and_display_elapsed_time(begin_time, "load_df_cons ")
    big_df = None
    import pandas as pd
    map_id_file_count_nan = {}

    for data in all_df_cons:
        one_df = pd.DataFrame(data["df_cons"])
        one_df["id_doc"] = 0
        one_df["id_mde"] = 0
        one_df["image_path"] = ""
        id_file = data["id_file"]
        out_folder = data["out_folder"] if "out_folder" in data else ""
        mhit = data["mhit"] if "mhit" in data else None
        if mhit is not None and mhit in map_mhit_outfolder:
            out_folder = map_mhit_outfolder[mhit]
        id_mde = data["id"] if "id" in data else None
        one_df["id_file"] = id_file
        map_id_file_count_nan[id_file] = 0
        for idx, row in one_df.iterrows():
            if row["id"] == "NaN" or row["id"] == "" or not str(row["id"]).isdigit():
                if row["Liste des pages"] != "":
                    map_id_file_count_nan[id_file] += 1
                    # print("Warning : id is NaN or empty in one_df, skipping this row : " + str(row))
                continue
            one_df.at[idx, "id"] = 1000 * id_mde + int(row["id"])
            one_df.at[idx, "id_mde"] = id_mde
            one_df.at[idx, "id_doc"] = int(row["id"])
            one_df.at[idx, "image_path"] = out_folder + "/page_" + str(row["Liste des pages"]) + ".png" if out_folder != "" else ""
            str_id = str(id_file) + "_l_" + str(row["id"])
            # one_df.at[idx, "id"] = str_id

        if big_df is None:
            big_df = one_df
        else:
            big_df = pd.concat([big_df, one_df], ignore_index=True)

        print("map_id_file_count_nan[id_file] " + str(map_id_file_count_nan[id_file]) + " rows with problems and non-null Liste des Pages")

    print(len(big_df), " rows in big_df")

    # Keep only the rows where "document_type" and "Liste des pages" are non-empty
    if "document_type" in big_df.columns and "Liste des pages" in big_df.columns:
        big_df = big_df[big_df["document_type"].notna() & big_df["Liste des pages"].notna()]
        print(len(big_df), " rows in big_df")
        big_df = big_df[big_df["document_type"] != '']
        print(len(big_df), " rows in big_df")
        big_df = big_df[big_df["Liste des pages"] != '']

    print(len(big_df), " rows in big_df")

    big_df["nb_word_cr"] = -1
    big_df["nb_word_quantile"] = -1
    map_list_quantile = {5: "0_5", 20: "5_20", 50: "20_50", 150: "50_150", 1000: "150_1000", 10000: "1000_infini"}
    for idx in big_df.index:
        data = big_df.loc[idx]
        if "cr_correct_typo" in data and not pd.isna(data["cr_correct_typo"]):
            nb_word_cr = len(str(data["cr_correct"]).split())
            big_df.at[idx, "nb_word_cr"] = nb_word_cr
            for q in map_list_quantile:
                if nb_word_cr < q:
                    break
            big_df.at[idx, "nb_word_quantile"] = map_list_quantile[q]
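
    # Bucketing sketch: map_list_quantile's keys are ascending upper bounds, and the
    # loop above leaves q at the first bound strictly greater than nb_word_cr. For
    # example nb_word_cr = 42 breaks at q = 50, so the row is tagged "20_50";
    # anything with 1000 words or more ends up in the last bucket "1000_infini".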

    big_df.to_csv('big_df.csv', sep='\t')
    begin_time, message = count_and_display_elapsed_time(begin_time, "build big_df ")

    print("big_df : " + str(big_df.head(10)))
    big_df = big_df.reset_index(drop=True)
    # big_df["ERROR"] = 'COL'

    # Initialize a dataframe with the same columns as big_df
    df_to_study = pd.DataFrame(columns=big_df.columns)
    # Appending the first row
    # df_to_study = pd.concat([df_to_study, big_df.iloc[0:0]], ignore_index=True)

    list_one_dim_distribution = ["document_type", "medecin_specialite",
                                 "document_type_auto", "medecin_specialite_auto",
                                 "Nombre de pages", "prediag", "Liste des pages", "nb_word_quantile"]
    for c in list_one_dim_distribution:
        if c not in big_df.columns:
            print("Column " + c + " not in big_df, skipping")
            continue
        print("Processing column STUDY ONE DIM " + c)
        map_val_nb = {}
        count_total = 0
        count_empty = 0
        count_not_empty = 0

        for idx in big_df.index:
            data = big_df.loc[idx]
            count_total += 1
            val = data[c] if c in big_df.columns else None
            if type(val) == list:
                # check for lists first: pd.isna on a list is ambiguous
                print("ERROR: value is a list, converting to string for counting: " + str(val))
                val = str(val)
            if pd.isna(val) or val == "":
                count_empty += 1
            else:
                count_not_empty += 1
            if val not in map_val_nb:
                map_val_nb[val] = 0
            map_val_nb[val] += 1
        # Display the distribution
        print("Distribution for column " + c + ":")
        # Order by count, decreasing
        map_val_nb = dict(sorted(map_val_nb.items(), key=lambda item: item[1], reverse=True))
        for val in map_val_nb:
            if count_total == 0:
                print(f"{val}: {map_val_nb[val]} (0.00%)")
            else:
                print(f"{val}: {map_val_nb[val]} ({round(map_val_nb[val] / count_total * 100, 2)}%)")

        # Display the results
        print("Column " + c + " :")
        print("Total rows: " + str(count_total))
        if count_total == 0:
            print("Empty auto: 0 (0.00%)")
            print("Not empty auto: 0 (0.00%)")
        else:
            print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
            print("Not empty auto: " + str(count_not_empty) + " (" + str(round(count_not_empty / count_total * 100, 2)) + "%)")
        print("")

    # Now build a table over different pairs of columns
    list_paires = [("document_type", "prediag"), ("document_type", "nb_word_quantile"), ("prediag", "nb_word_quantile")]
    for (col1, col2) in list_paires:
        if col1 not in big_df.columns or col2 not in big_df.columns:
            print("Column " + col1 + " or " + col2 + " not in big_df, skipping")
            continue
        print("Processing column pair STUDY TWO DIM " + col1 + " and " + col2)
        map_val_nb = {}
        map_val1_nb = {}
        count_total = 0
        count_empty = 0
        count_not_empty = 0

        for idx in big_df.index:
            data = big_df.loc[idx]
            count_total += 1
            val1 = data[col1]
            val2 = data[col2]
            if pd.isna(val1) or val1 == "" or pd.isna(val2) or val2 == "":
                count_empty += 1
            else:
                count_not_empty += 1
            if str(val1) not in map_val_nb:
                map_val_nb[str(val1)] = {}
                map_val1_nb[str(val1)] = 0
            if str(val2) not in map_val_nb[str(val1)]:
                map_val_nb[str(val1)][str(val2)] = 0
            key = str(val1) + "_" + str(val2)

            map_val_nb[str(val1)][str(val2)] += 1
            map_val1_nb[str(val1)] += 1

        # Display the distribution
        print("Distribution for columns " + col1 + " and " + col2 + ":")
        # Order by count, decreasing
        # map_val_nb = dict(sorted(map_val_nb.items(), key=lambda item: item[1], reverse=True))
        for k1 in map_val_nb:
            print(f"{k1}: {map_val1_nb[k1]} ({round(map_val1_nb[k1] / count_total * 100, 2)}%) ")
            for k2 in map_val_nb[k1]:
                import sys
                sys.stdout.write(f"{k2} : {map_val_nb[k1][k2]} ({round(map_val_nb[k1][k2] / map_val1_nb[k1] * 100, 2)}%) ")
            print("")

        # Display the results
        print("Columns " + col1 + " and " + col2 + ":")
        print("Total rows: " + str(count_total))
        print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
        print("Not empty auto: " + str(count_not_empty) + " (" + str(round(count_not_empty / count_total * 100, 2)) + "%)")
        print("")

    list_comparison = []

    # Compute the error and correct rates for each column
    # , 'Nombre de pages'
    list_comparison.append(("intro_correct_typo", "intro_back"))
    for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 'Titre',
                    'medecin_specialite', 'nom_hopital', 'genre_service_hopital',
                    'indication_examen', 'date_entree_hospitalisation',
                    'date_sortie_hospitalisation', 'motif_hospitalisation',
                    'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced',
                    'date_fin_arret_travailt', 'date_entree_hospitalisationt', 'date_sortie_hospitalisationt']:
        col_auto = col_ref + "_auto"
        col_manual = col_ref  # + "_manual"
        list_comparison.append((col_manual, col_auto))

    list_comparison.append(("cr_correct_typo", "cr_back"))

    list_error_for_meta_data = []
    for (col_manual, col_auto) in list_comparison:
        if col_auto not in big_df.columns or col_manual not in big_df.columns:
            print("Column " + col_auto + " or " + col_manual + " not in big_df, skipping")
            continue
        list_errors = []
        list_commentaires_corrects = []
        list_commentaires_errors = []
        print("Processing column " + col_manual)
        count_ignore = 0
        count_total = 0
        count_correct = 0
        count_error = 0
        count_empty = 0
        count_not_empty = 0
        count_not_empty_auto = 0
        count_not_empty_manual = 0
        for idx in big_df.index:
            doc_type = big_df.loc[idx, "document_type"]
            comms = big_df.loc[idx, "Commentaires"]
            if doc_type in map_list_input_by_document:
                list_input = map_list_input_by_document[doc_type]
                if col_manual not in list_input and col_manual in list_complete and col_manual != 'document_type':
                    # print("Ignoring column " + col_manual + " for document type " + doc_type)
                    count_ignore += 1
                    # list_error_for_meta_data.append(idx)
                    continue
            else:
                if condition_intro_doc:
                    print("MISSING DATA TO ANALYZE : " + str(doc_type))
                pass
            if condition_only_one_page:
                # if "Nombre de page" in big_df.columns :
                if "Liste des pages_auto" in big_df.columns and "Liste des pages" in big_df.columns:
                    if str(big_df.loc[idx, "Liste des pages_auto"]).strip().isdigit() and str(big_df.loc[idx, "Liste des pages"]).strip().isdigit() and int(big_df.loc[idx, "Liste des pages"]) == int(big_df.loc[idx, "Liste des pages_auto"]):
                        pass
                    else:
                        if "," not in str(big_df.loc[idx, "Liste des pages_auto"]) or "," not in str(big_df.loc[idx, "Liste des pages"]):
                            if "none" != str(big_df.loc[idx, "Liste des pages_auto"]).lower():
                                print("UNEXPECTED LISTE DES PAGES " + str(big_df.loc[idx, "Liste des pages_auto"]) + "||" + str(big_df.loc[idx, "Liste des pages"]))
                        else:
                            import sys
                            sys.stdout.write("ñø")
                            # several pages
                        count_ignore += 1
                        list_error_for_meta_data.append(idx)
                        continue
                else:
                    print("UNEXPECTED missing Nombre de page on document_type : " + str(doc_type) + " ")
                    # print("UNEXPECTED missing info intro on document_type : " + str(doc_type) + " ")
            if only_correct_prediag:
                if big_df.loc[idx, "prediag"] not in ("OK", "BON"):
                    if big_df.loc[idx, "prediag"] not in ("", "AUTRE", "CERFA_MANUSCRIT", "MANUSCRIT", "TABLEAU", "MAUVAIS", "PRESQUEBON"):
                        print("UNEXPECTED " + big_df.loc[idx, "prediag"])
                    count_ignore += 1
                    list_error_for_meta_data.append(idx)
                    continue
            else:
                if big_df.loc[idx, "prediag"] == "":
                    count_ignore += 1
                    list_error_for_meta_data.append(idx)
                    continue
            data = big_df.loc[idx]
            count_total += 1
            auto_val = data[col_auto]
            manual_val = data[col_manual]
            if pd.isna(auto_val) or auto_val == "":
                count_empty += 1
            else:
                count_not_empty += 1
                count_not_empty_auto += 1
            if type(manual_val) == list or pd.isna(manual_val) or manual_val == "":
                continue
            count_not_empty_manual += 1
            id_file = data["id_file"]
            lp_id = data["Liste des pages"]
            mde_id = int(data["id"] / 1000) if data["id"] is not None else ""
            if str(auto_val).lower() == str(manual_val).lower():
                count_correct += 1
                list_commentaires_corrects.append(comms)
            else:
                count_error += 1
                list_error_for_meta_data.append(idx)
                if idx in [13, 19, 25, 28, 29, 30, 31, 32, 34, 38, 41, 42, 43, 55]:
                    print("Unexpected wrong meta data but perfect cr_back !")
                comms_start = comms[:32] if type(comms) == str else ""
                list_errors.append((mde_id, lp_id, manual_val, auto_val, comms_start))
                list_commentaires_errors.append(comms)
                if condition_intro_doc and doc_type in map_list_input_by_document and col_manual in map_list_input_by_document[doc_type]:
                    if idx not in list_error_for_meta_data:
                        list_error_for_meta_data.append(idx)
                    intro_correct = big_df.loc[idx, "intro_correct_typo"]
                    if not col_manual.startswith("date") and manual_val.lower() not in intro_correct.lower():
                        print("Error in " + col_manual + " for " + doc_type + " : " + str(manual_val) + " not in " + str(intro_correct) + " for idx : " + str(idx))
                        print("TO ANALYZE")
                if col_manual in list_to_study:
                    try:
                        # big_df.iloc[idx, "ERROR"] = col_manual
                        # Add the row to the df_to_study dataframe
                        big_df.loc[idx, "Commentaires"] = "DIFFERENCE in " + col_manual + " : " + comms
                        one_more_line = big_df.iloc[idx:idx + 1]
                        df_to_study = pd.concat([df_to_study, one_more_line], ignore_index=True)
                        print("Padam")
                    except Exception as e:
                        print("Error adding row to df_to_study for idx " + str(idx) + ": " + str(e))
                        continue

        # Display the results
        print("Column " + col_manual + " :")
        print("Ignored rows: " + str(count_ignore))
        print("Total rows: " + str(count_total))
        if count_total == 0:
            print("Empty auto: 0 (0.00%)")
            print("Not empty auto: 0 (0.00%)")
        else:
            print("Correct: " + str(count_correct) + " (" + str(round(count_correct / count_total * 100, 2)) + "%)")
            print("Errors: " + str(count_error) + " (" + str(round(count_error / count_total * 100, 2)) + "%)")
            print("Empty auto: " + str(count_empty) + " (" + str(round(count_empty / count_total * 100, 2)) + "%)")
            print("Not empty auto: " + str(count_not_empty_auto) + " (" + str(round(count_not_empty_auto / count_total * 100, 2)) + "%)")
            print("Not empty manual: " + str(count_not_empty_manual) + " (" + str(round(count_not_empty_manual / count_total * 100, 2)) + "%)")
        print("Lists of error pairs")
        print(str(list_errors))
        print("Lists of comments in the error cases")
        print(str(list_commentaires_errors))
        print("Lists of comments in the correct cases")
        print(str(list_commentaires_corrects))
        print("")

    list_complete = list(big_df.index)  # list(range(0, len(big_df)))
    list_total_correct = []  # list(big_df.index)
    for idx in list_complete:
        if idx not in list_error_for_meta_data:
            list_total_correct.append(idx)

    print("Total correct : " + str(len(list_total_correct)))
    print(str(list_total_correct))

    if condition_intro_doc:
        list_all_correct = list(big_df.index[big_df["intro_correct_typo"] == big_df["intro_back"]]) + list_total_correct
        for idx in list_all_correct:
            print("idx : " + str(idx) + " intro_correct_typo : " + str(big_df.loc[idx, "id"]))

        print(big_df.index[big_df["intro_correct_typo"] == big_df["intro_back"]])
        print(list_total_correct)
        for idx in list_all_correct:
            try:
                data = big_df.loc[idx]
                doc_type = data["document_type"]
                list_needed_column = map_list_input_by_document.get(doc_type, [])
                input_as_json = {}
                for f in list_needed_column:
                    if f in data and not pd.isna(data[f]) and data[f] != "":
                        if f.startswith("date"):
                            import datetime
                            data_used = datetime.datetime.strptime(data[f][:10], "%Y-%m-%d")
                        else:
                            data_used = data[f]
                        input_as_json[f] = data_used
                    else:
                        input_as_json[f] = None
                from lib.lib_util import format_one_res
                if doc_type in intro_format_intro:
                    intro_from_manual_saved = format_one_res(input_as_json, intro_format_intro[doc_type], format_premier=False, format_date="%d %B %Y")
                else:
                    print("Missing intro_format_intro for doc_type : " + str(doc_type))
                    intro_from_manual_saved = ""
                print(" idx : " + str(idx))
                print(" id : " + str(data["id"]))
                print(" intro_from_manual_saved :" + str(intro_from_manual_saved))
                print(" intro_correct_typo : " + str(data["intro_correct_typo"]))
                print(" intro_back : " + str(data["intro_back"]))
            except Exception as e:
                print("Error processing idx " + str(idx) + ": " + str(e))
                continue

    if prepare_data_set:
        for idx, data in big_df.iterrows():
            doc_type = data["document_type"]
            if doc_type in map_list_input_by_document:
                list_needed_column = map_list_input_by_document[doc_type]
            else:
                print("Missing doc_type in map_list_input_by_document : " + str(doc_type))
                continue
            one_data_set = {}
            if "image_path" not in data:
                print("Missing image_path, did you use with_out_folder ? " + str(data) + " for doc_type : " + str(doc_type) + ", skipping this row.")
                continue
            # image_path
            one_data_set["image_path"] = data["image_path"] if "image_path" in data else ""
            json_extract = {}
            for c in list_needed_column:
                if c in data and not pd.isna(data[c]) and data[c] != "":
                    if c.startswith("date"):
                        try:
                            import datetime
                            json_extract[c] = datetime.datetime.strptime(data[c][:10], "%Y-%m-%d")
                        except Exception as e:
                            print("Error parsing date for " + c + ": " + str(e))
                            json_extract[c] = "None"
                    else:
                        json_extract[c] = data[c]
                else:
                    json_extract[c] = None
                    print("Missing data !")
                    continue
            one_data_set["extract_meta_data"] = json_extract
            list_dataset.append(one_data_set)

        import datetime
        jour_suffix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        list_dataset_out_file = "dataset_" + jour_suffix + ".json"
        with open(list_dataset_out_file, "w") as f:
            import json
            # Convert list_dataset to JSON format
            # Ensure that datetime objects are converted to strings
            # f.write(str(list_dataset))
            f.write(json.dumps(list_dataset, default=str, indent=4))

        df_data = pd.DataFrame(list_dataset, columns=["image_path", "extract_meta_data"])
        list_dataset_out_file = "dataset_as_df_" + jour_suffix + ".csv"
        df_data.to_csv(list_dataset_out_file, sep='\t', index=False)

        from lib.lib_ml.lib_util_prepare_dataset import from_csv_create_json_dataset
        from_csv_create_json_dataset(list_dataset_out_file, crop=False,
                                     server_root="https://safia.app",
                                     folder_root="",
                                     download_or_get_local_file=False,  # used only for crop !
                                     col_url_path="image_path",
                                     col_text="extract_meta_data",
                                     sep='\t')

    df_to_study = df_to_study.reset_index(drop=True)
    return df_to_study