Coverage for lib/datou/lib_datou_step_template.py: 62%

1310 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2026-02-10 01:10 +0100

1import datetime 

2import os.path 

3import shutil 

4import types 

5 

6import numpy as np 

7 

8from lib.brick_layers.lib_abstract_generic_layer import LayerGeneric, LayerPrompt 

9 

10# TODO ARCHI VR 14-6-23 : est-ce qu'on ferait une classe pour avoir les services de cost_estimation 

11from auth.lib_cost import CostEstimation as CE 

12from uuid import uuid4 

13 

14# speech_to_text 

def datou_safia_step_speech_to_text(input : dict, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    """Transcribe an audio/video file into text.

    Parameters
    ----------
    input : dict
        Must contain "file" (path to the media file). An optional
        "preprompt" key is forwarded untouched to the output.
    param_json : dict, optional
        Must contain "openai_token"; may contain "parse_prefix_file".
    ce : CE
        Cost-estimation accumulator; compute_cost_search() is called with
        the model used and the audio duration in seconds.
    verbose : bool
        Forwarded to the speech-to-text helpers.
    layer_api : LayerGeneric
        Unused here; kept for step-signature uniformity.

    Returns
    -------
    dict
        {"text": transcript} plus "preprompt" when present in the input.
    """
    # BUGFIX: the original used a mutable default argument ({}), which is
    # shared across calls; use a None sentinel instead.
    if param_json is None:
        param_json = {}

    list_input = ["file"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["text"]
    list_param_json = ["openai_token"]  # TODO VR REFACTO : have an editing interface or a context from which to fetch this info (that context may depend on the project / model type / a datou "instantiation")

    file = input["file"]
    openai_token = param_json["openai_token"]

    parse_prefix_file = bool(param_json["parse_prefix_file"]) if "parse_prefix_file" in param_json else False

    # Hard-coded param: let the backend auto-detect the language.
    language = None

    from lib.lib_speechtotext import speech_to_text
    text = ""
    length_time_seconds = 0
    model = ""  # BUGFIX: pre-initialize so ce.compute_cost_search never sees an unbound name
    if file.endswith((".amr", ".ogg", ".mp4", ".webm")):
        # Containers the transcription API cannot ingest directly are converted first.
        print("import convert_file")
        from lib.lib_speechtotext import convert_file
        print("calling convert_file")
        new_file = convert_file(file)
        print("calling speech_to_text")
        text, length_time_seconds, model = speech_to_text(new_file, openai_token, language=language, verbose=verbose)

    # TODO VR REFACTO : these two duplicated calls are not great
    elif file.endswith((".mp3", ".m4a", ".wav")):
        size = os.path.getsize(file)
        print(" size : " + str(size))

        if size > 10000000:  # pragma no cover scale
            # Files above ~10 MB are converted and split into chunks small
            # enough for the transcription API, then transcribed piecewise.
            print(" size : " + str(size))
            from lib.lib_speechtotext import convert_file
            print("calling convert_file")
            new_file = convert_file(file)
            size = os.path.getsize(new_file)
            print(" size : " + str(size))

            from lib.lib_speechtotext import split_mp3
            nb_split = 1 + int(size / 10000000)
            list_files = split_mp3(new_file, nb_split, verbose=verbose)

            text = ""
            length_time_seconds = 0
            model = ""
            for file_aux in list_files:
                text_aux, length_time_seconds_aux, model = speech_to_text(file_aux, openai_token, language=language, verbose=verbose)
                text += text_aux
                length_time_seconds += length_time_seconds_aux
        else :
            text, length_time_seconds, model = speech_to_text(file, openai_token, language=language, verbose=verbose)
    else :
        # ROBUSTNESS: unknown extensions previously fell through with no
        # transcription attempt; try the direct call as a best effort.
        text, length_time_seconds, model = speech_to_text(file, openai_token, language=language, verbose=verbose)

    ce.compute_cost_search(model, length_time_seconds)

    # TODO VR REFACTO : the temporary converted/split files should also be deleted

    output = {"text" : text}

    if "preprompt" in input:
        output["preprompt"] = input["preprompt"]

    return output

78 

79 

80 

def sub_func_read_ocr(f, count, model, verbose, map_file_size, map_file_text, folder_export_boxes,
                      begin_page=False, end_page=False, file_output="output",
                      layer_api = None, vllm_model = None,
                      request_used = None):
    """Run one OCR / vision pass on a single image file.

    Dispatches on *model*: "tesseract", "prompt" (VLLM via layer_api),
    "gcp_doc_ai" (with a Google-Vision fallback on failure), anything else
    falls back to Google Vision OCR. Optionally exports the detected boxes
    as JSON into *folder_export_boxes*, records text length and content in
    the two map_* accumulators (mutated in place), and returns
    (count, text, SubDocPage).
    """
    from lib.lib_util import SubDocPage
    from lib.lib_ocr import img_to_texte, ocr_google_vision, gcp_doc_ai
    # VR TODO 9-5-25: this loop exists twice and is therefore only tested once, but we do not know which one
    if model == "tesseract":
        text, list_boxes, maxx, maxy, list_blocks = img_to_texte(f, verbose)
    elif model == "prompt":
        print("Prompt ! ")
        # The prompt path produces only text — no geometry.
        list_boxes = []
        maxx = 0
        maxy = 0
        list_blocks = {}

        if verbose:
            print("before call request_gpt")
        try :
            if layer_api == None:
                # BUGFIX: the original assigned (result, nb_token, modele)
                # here, leaving `text` unbound and crashing later at
                # len(text); assign `text` instead.
                text, nb_token, modele = "", 0, ""
            else :
                text, nb_token, modele = layer_api.prompt(request_used = request_used, gpt_model = vllm_model,
                                                          verbose = verbose,
                                                          images = [f])
        except Exception as e:
            print(str(e))
            text, nb_token, modele = "", 0, "ERROR IN PROMPT"

    elif model == "gcp_doc_ai":
        # Document AI rejects very large files; we still try, and rely on
        # the except fallback below when it fails.
        if os.stat(f).st_size > 20000000:
            print(" Expecting failure due to too big file : " + str(f) + " " + str(os.stat(f).st_size))
        else:
            print(" os.stat(f).st_size : " + str(os.stat(f).st_size))
        try:
            text, list_boxes, maxx, maxy, list_blocks = gcp_doc_ai(f, verbose=verbose)
        except Exception as e:
            print("ERROR TREATED AS WARNING THANKS RECUPERATION : OCR gcp_doc_ao FAILED on " + str(
                f) + " We wil try the old one ! too bad if it is a cerfa ")
            print(str(e))
            text = ""
            list_boxes = []
            maxx = 0
            maxy = 0
            list_blocks = []
            text, list_boxes, maxx, maxy, list_blocks = ocr_google_vision(f, verbose)
    else: # google_ocr
        text, list_boxes, maxx, maxy, list_blocks = ocr_google_vision(f, verbose)
    if folder_export_boxes != "":
        # Dump the raw OCR boxes next to the page index for later inspection.
        if not os.path.exists(folder_export_boxes):
            os.makedirs(folder_export_boxes)
        with open(folder_export_boxes + "/" + file_output + "_" + str(count) + ".json", "w") as of:
            import json
            of.write(json.dumps(list_boxes))
    map_file_size[f] = len(text)
    map_file_text[f] = text
    sdp = SubDocPage(count, text, f, list_boxes, maxx, maxy, list_blocks)

    return count, text, sdp

143 

144# image_to_text 

def datou_safia_step_image_to_text(input : dict, param_json : dict = {}, ce : CE = None,
                                   verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    """OCR step: extract text from an input image or PDF file.

    PDFs are first rendered to one PNG per page; every page is then read
    by sub_func_read_ocr with the engine selected by param_json["model"]
    ("google_ocr" by default), and the per-page texts are re-assembled
    into one transcript per logical document group.

    NOTE(review): input/param_json carry many optional keys whose schema
    is defined by the callers (datou pipeline) — confirm against callers.
    Returns a dict with "text", "preprompt", page contents/images and
    bookkeeping metadata (see the output dict near the end).
    """
    list_input = ["file"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["text", "preprompt"]  # TODO VR REFACTO : this preprompt should be built elsewhere
    list_param_json = ["google_token", "model"]  # TODO VR REFACTO : have an editing interface or a context from which to fetch this info (that context may depend on the project / model type / a datou "instantiation")

    model = param_json["model"] if "model" in param_json else "google_ocr"
    file = input["file"]
    google_token = param_json["google_token"] if 'google_token' in param_json else None
    dpi = param_json["dpi"] if "dpi" in param_json else 72  # rendering resolution for PDF -> PNG
    if google_token == None:
        print(" Will crash or not !")

    parse_prefix_file = bool(param_json["parse_prefix_file"]) if "parse_prefix_file" in param_json else True
    print(" parse_prefix_file : " + str(parse_prefix_file))
    parse_date_test_before_own_datou_step = bool(param_json["parse_date_test_before_own_datou_step"]) if "parse_date_test_before_own_datou_step" in param_json else False
    hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else None
    only_count = bool(param_json["only_count"]) if "only_count" in param_json else False
    only_extract_page = bool(param_json["only_extract_page"]) if "only_extract_page" in param_json else False

    # When multiple files arrive in input as a raw split.
    multi_input = input["multi_input"] if "multi_input" in input else False
    saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""
    saxia_all_doc_separated = input["saxia_all_doc_separated"] if "saxia_all_doc_separated" in input else False
    use_split_complet = param_json["use_split_complet"] if "use_split_complet" in param_json else False
    if saxia_all_doc_separated and (saxia_split_end_csv != "") and use_split_complet:
        print("Here we want to not split with prompt, maybe we should use multi_input and if there is only file w ecould also activate this case ! !")
    create_output_hit = bool(param_json["create_output_hit"]) if "create_output_hit" in param_json else False

    # Normalise `file` to a list and keep a single reference file for metadata.
    if type(file) == list:
        if len(file) == 0:
            print("ERROR treated as WARNING : No Input file !, we can quit or not, it shouldn't matter ")

        # temporary
#        file = file[0]

        print(" ERROR treated as WARNING : only the first file will be treated : " + str(file))
        print("TODO multiple files not implemented yet !")
        if multi_input:
            print("VR TODO 24/6/24 wip multi_input => is in fact working !")
        else :
            print(" We have not decided yet the default behavior VR TODO CDC 24/6/24 : for now this is an internal error to be in this situation")
        one_file_reference = file[0]
    else:
        one_file_reference = file
        file = [file]

    if one_file_reference == None:
        print(" ERROR treated as WARNING : No Input file reference !, we can quit or not, it shouldn't matter ")
    # File metadata used at the end to enrich the output dict.
    size_file = os.stat(one_file_reference).st_size
    created_at = datetime.datetime.fromtimestamp(os.stat(one_file_reference).st_ctime)
    in_folder = os.path.dirname(one_file_reference)
    work_folder_images = os.path.dirname(one_file_reference)

    if one_file_reference.lower().endswith(".pdf"):
        from lib.lib_util import from_pdf_to_list_pngs
        list_pngs, count_per_doc, list_of_list_of_pages = from_pdf_to_list_pngs(file, dpi = dpi, hash_id_treatment = hash_id_treatment, only_count = only_count)
    else :
        if multi_input:
            print("Internal error as of 24/6/24, behavior to be developped CDC TODO VR 24/6/24")
        # Non-PDF input: the file itself is the single "page".
        list_pngs = [one_file_reference]
        list_of_list_of_pages = [[1]]

    if len(list_pngs) == 0:
        print("TO activate after some test !")
#        saxia_all_doc_separated = True

    if saxia_all_doc_separated:
        print(" We should avoid doing split with prompt and treat all different case !")

    # Parse "<id>_<date>_<nb_pages>"-style information from the file name.
    from lib.lib_util import parse_id_date_nb_page_folder
    json_prefix_file = {}
    if parse_prefix_file :
        date_input = input["date"] if "date" in input else param_json["date"] if "date" in param_json else None
        nb, id, date = parse_id_date_nb_page_folder(one_file_reference)
        if id == 0:
            id = param_json["id"] if "id" in param_json else 0
        if nb == 0:
            nb = len(list_pngs)
        print(" date " + str(date) + " and id " + str(id) + " and nb " + str(nb) + " and file " + str(one_file_reference) + " and input keys " + str(input.keys()))
        if date == None:
            # Fall back to the provided date, or to today when none is given.
            print("Using date_input as date " + str(date_input) + " and date " + str(date) + " and id " + str(id) + " and nb " + str(nb) + " and file " + str(one_file_reference) + " and input keys " + str(input.keys()))
            if date_input == None:
                date = datetime.datetime.now().strftime("%Y%m%d")
            else :

                if type(date_input) == str:
                    from lib.lib_util import parse_date
                    date, parsed_or_forced = parse_date(date_input, settings=None)
                    date = date.strftime("%Y%m%d")
                else:
                    date = date_input.strftime("%Y%m%d")
        json_prefix_file = {"nb" : nb,
                            "date" : date,
                            "id" : id}
    filename_at = json_prefix_file["date"] if "date" in json_prefix_file else datetime.datetime.now().strftime("%Y%m%d 00:00:00")

    from lib.lib_util import create_prefix_file_name_from_json_prefix
    prefix_file = create_prefix_file_name_from_json_prefix(json_prefix_file)

    print("keyword_to_parse_for_suivi_and_crash_id_file : " + str(prefix_file))
    print("keyword_to_parse_for_suivi_and_crash_hit : " + str(hash_id_treatment))

    if only_extract_page:
        # Short-circuit: the caller only wants the rendered pages.
        output = {"files" : list_pngs, "nb_page" : len(list_pngs)}
    elif only_count:
        output = {}
    else :
        map_file_size = {}
        map_file_text = {}

        begin_page = bool(param_json['begin_page']) if 'begin_page' in param_json else None
        end_page = bool(param_json['end_page']) if 'end_page' in param_json else None
        limit = param_json["limit"] if "limit" in param_json else 0  # max pages to OCR (0 = no limit)
        parallel = bool(param_json["parallel"]) if "parallel" in param_json else False
        nb_thread = param_json["nb_thread"] if "nb_thread" in param_json else 10

        folder_export_boxes = param_json['folder_export_boxes'] if 'folder_export_boxes' in param_json else ""

        begin_page_txt = ""
        end_page_txt = ""


        complete_text = ""
        list_page_content = []
        list_page_content_text = []
        file_output = prefix_file + "_" + input["hash_id_treatment"] if "hash_id_treatment" in input else prefix_file + "_" + str(uuid4())

        request_used = input["preprompt"] if "preprompt" in input else None
        vllm_model = param_json["vllm_model"] if "vllm_model" in param_json else None #"mistral-small3.1"

        print(str(list_pngs))
        print(" verbose : " + str(verbose))
        print("About to parallel or not")
        if parallel and len(list_pngs) > nb_thread:
            print("WARNING : not implemented yet for parallel and more than nb_thread images")

        if parallel and len(list_pngs) <= nb_thread:
            # Parallel path: one thread per page (capped by nb_thread).
            from lib.datou.lib_parallel import multi_thread_image_read
            map_pids_path, map_sdp, map_text = multi_thread_image_read(model, verbose, map_file_size, map_file_text,
                                                                       folder_export_boxes, begin_page, end_page, file_output,
                                                                       nb_thread=nb_thread, list_pngs=list_pngs,
                                                                       layer_api = None, vllm_model = vllm_model)

            for i in range(len(list_pngs)):
                nb = i + 1
                sdp = map_sdp[nb]
                list_page_content.append(sdp)
                list_page_content_text.append(sdp.content)

            print(" Inside parallel ! ")
            print(" map_text.keys() " + str(map_text.keys()))
        else:
            # Sequential path: pages are OCRed one by one.
            map_text = {}
            count = 1
            for f in list_pngs:
                if limit > 0 and count > limit:
                    break
                count, text, sdp = sub_func_read_ocr(f, count, model, verbose, map_file_size, map_file_text, folder_export_boxes,
                                                     begin_page, end_page, file_output, layer_api=layer_api,
                                                     vllm_model=vllm_model, request_used = request_used)

                map_text[count] = text
                list_page_content.append(sdp)
                list_page_content_text.append(sdp.content)
                complete_text += begin_page_txt + text + end_page_txt
                count = count + 1
            print("Outside parallel")
            print(" map_text.keys() " + str(map_text.keys()))

        print(" map_text.keys() " + str(map_text.keys()))
        for page in map_text:
            print(" size : " + str(len(map_text[page])))

        if saxia_split_end_csv != "":
            # An explicit split specification overrides the PDF-derived grouping.
            print(" list_of_list_pages should be just [range(1, len(list_pngs))]")
            from lib.lib_util import build_list_of_list_from_split
            list_of_list_of_pages = build_list_of_list_from_split(saxia_split_end_csv, len(list_pngs))

        from lib.lib_util import create_transcript_group_of_pages
        complete_texts = create_transcript_group_of_pages(list_of_list_of_pages, map_text)
        if not multi_input and saxia_split_end_csv == "":
            print(" We should have only one group of page here !")
            if len(complete_texts) != 1:
                print(" WARNING data will be ignored !")
            complete_text = complete_texts[0]

        # NOTE(review): a ~45-line commented-out pre-refactor loop (building
        # complete_texts manually from count_per_doc / list_of_list_of_pages)
        # used to live here; it was superseded by
        # create_transcript_group_of_pages above (refactor of 11/9/24,
        # "TESTED IN ONE CASE, TO REMOVE ON 15/10/2024") and has been dropped.

        # TODO VR 5-4-25 : this is for auto split : not used yet
        if parse_date_test_before_own_datou_step:
            from lib.lib_util import parse_date_test_before_own_datou_step
            map_res_page_date = parse_date_test_before_own_datou_step(list_page_content)
            print("TO USE and TEST or use when failing in load_tab")
        else :
            map_res_page_date = {}

        # VR TODO: not happy with this at all — no, not at all 21-1-24.
        # The old hard-coded default prompt (carbon/calories/price table in
        # French) was removed; tolerate the preprompt either from the
        # interface input or from the default datou for jpg input.
        prefix_prompt_input = input["preprompt"] if "preprompt" in input else ""
        # TODO VR REFACTO : the files should also be deleted, here or elsewhere

        print("NIMP si estimer prefix_prompt_input : " + str(prefix_prompt_input)[:100])

        # NOTE(review): cost is charged per OCRed page under the
        # "google_ocr" label regardless of the engine used — confirm intent.
        ce.compute_cost_search("google_ocr", len(map_file_text))

        data = [
            {
                "id": file_output,
                "text": "\n".join(complete_texts)
            }
        ]

        if multi_input or saxia_split_end_csv != "":
            complete_text = complete_texts

        print("begin_page complete_text : " + str(complete_text[:100]).replace("\n", "§§"))

        # If the preprompt is empty we could also omit it, but the
        # prompt_gpt step handles that case anyway, so here we are!
        output = {"text" : complete_text, "preprompt" : prefix_prompt_input,
                  "json_to_save" : data,
                  "list_page_content" : list_page_content,
                  "list_page_content_text" : list_page_content_text,
                  # "map_file_size" : map_file_size, "map_file_text" : map_file_text,
                  "images": [f for f in list_pngs],
                  "paragraphs" : [p.list_blocks["paragraphs"] if "paragraphs" in p.list_blocks else [] for p in list_page_content],
                  "in_folder" : in_folder,
                  "work_folder_images" : work_folder_images,
                  "map_res_page_date" : map_res_page_date}

        if saxia_all_doc_separated and use_split_complet:
            print("TO TEST")
            output["multi_input"] = True
            output["text_only_for_meta_data_and_not_split"] = output["text"]
            output["text"] = []

    if parse_prefix_file:
        output["prefix_file"] = json_prefix_file
        output["id_file"] = prefix_file
        output["nb_page"] = len(list_pngs)
        output["filename_at"] = filename_at

    output["input_file_available_at"] = created_at
    output["size_file"] = size_file

    if create_output_hit:
        output["output_hit"] = prefix_file + "_" + input["hash_id_treatment"] if "hash_id_treatment" in input else prefix_file + "_" + str(uuid4())

    return output

440 

441 

442 

def datou_safia_step_request_gpt(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerPrompt = None) -> dict :
    """Prompt step: send preprompt + text to the LLM layer and return its answer.

    input may carry "preprompt", "text" (str or list of str), "multi_input"
    and "saxia_split_end_csv"; param_json may carry "preprompt",
    "exec_if_true", "openai_token" and "gpt_model". When exec_if_true is
    falsy the step is skipped and the input is returned unchanged.

    Returns {"result": ..., "request": ...}; "result" is the list of
    per-text answers when multi_input or a split specification is active.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["preprompt", "text"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["result", "request"]  # TODO VR REFACTO : this preprompt should be built elsewhere
    list_param_json = ["openai_token", "gpt_model"]  # TODO VR REFACTO : have an editing interface or a context from which to fetch this info

    if verbose :
        print("Inside request gpt")

    # Preprompt priority: non-empty input value, then param_json, then "".
    if "preprompt" in input and input["preprompt"] != "":
        preprompt = input["preprompt"]
    elif "preprompt" in param_json:
        preprompt = param_json["preprompt"]
    else :
        preprompt = ""
        print(" all keys input : " + str(input.keys()))
    text = input["text"] if "text" in input else ""
    multi_input = input["multi_input"] if "multi_input" in input else False
    saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""

    exec_if_true = param_json["exec_if_true"] if "exec_if_true" in param_json else True
    if not exec_if_true or exec_if_true == {}:
        print(" dont_exec_if_false is True, we skip the formatting step ")
        return input
    print("PAssed exec if true prompt step")

    if type(text) == list:
#        text = text[0]
        print(" ERROR treated as WARNING : only the first text will be treated : " + str(text))
        print("TODO multiple files not implemented yet !")
        if multi_input or saxia_split_end_csv != "":
            print(" Here we need to do something !")
            texts = text
        else :
            print("As of 24/6/24 internal error")
            # BUGFIX: `texts` was unbound on this path, raising NameError
            # below; fall back to treating each list item as a text.
            texts = text
    else:
        texts = [text]

    model = ""
    size_correct = True
    nb_token = 0
    result = ""
    request = ""
    if len(texts) == 0:
        print("List empty of texts as input prompt !")
    results = []
    for text in texts:
        print(" begin text begin_page " + str(text[:50].replace("\n", " ")))
        request = preprompt + text

        # TODO: to be removed, this was injected into the configuration
        openai_token = param_json["openai_token"]
        gpt_model = param_json["gpt_model"] if "gpt_model" in param_json else "gpt-4"

        from lib.lib_util import check_and_truncate_query_max_token
        size_correct, request_truncated = check_and_truncate_query_max_token(request)

        request_used = request if size_correct else request_truncated

        if verbose:
            print("before call request_gpt")
        try :
            if layer_api == None:
                (result, nb_token, model) = ("", 0, "")
            else :
                result, nb_token, model = layer_api.prompt(request_used, gpt_model, verbose = verbose)
        except Exception as e:
            # Best effort: never let one failed prompt abort the batch.
            print(str(e))
            result, nb_token, model = "", 0, "ERROR IN PROMPT"

        results.append(result)

#        from lib.lib_openai import request_gpt
#        result, nb_token, model = request_gpt(openai_token, request_used, gpt_model, verbose = verbose)
        if verbose :
            print("after request_gpt")
        ce.compute_cost_search(model, nb_token)
        if not size_correct:
            print("WARNING TOO LONG QUERY ")
            result = "Your query was too long and has been truncated :" + result

    if multi_input or saxia_split_end_csv != "":
        result = results

    return {"result" : result, "request" : request}

527 

528 

529 

530# send_mail 

def datou_safia_step_send_mail(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Notify the user with the step result, by email (SES) and optionally SMS/Slack.

    The result is wrapped with the auth info and a privacy/version footer,
    the raw request is written next to the input file as a .txt attachment
    when a file is given, and the email is sent to
    param_json["from_mail_to_send"] (plus input["cc"] when present).

    Returns {"object": subject, "body": result}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["request", "result", "file"]
    list_output = ["result", "object"]
    list_param_json = ["info_auth", "hash_id_treatment", "privacy", "from_mail_to_send"]

    privacy = param_json["privacy"] if "privacy" in param_json else False
    info_auth = param_json["info_auth"] if "info_auth" in param_json else None
    from_mail_to_send = param_json["from_mail_to_send"] if "from_mail_to_send" in param_json else None
    # NOTE: the original computed `privacy` twice; the duplicate was removed.
    hash_id_treatment = param_json["hash_id_treatment"] if "hash_id_treatment" in param_json else input["hash_id_treatment"] if "hash_id_treatment" in input else None
    send_mail = param_json["send_mail"] if "send_mail" in param_json else True
    send_sms = param_json["send_sms"] if "send_sms" in param_json else False
    send_slack = param_json["send_slack"] if "send_slack" in param_json else False
    type_email = param_json["type_email"] if "type_email" in param_json else "plain_text"

    # VR TODO : an override should probably be added (assoc can be used for
    # that), but when uploading audio files default behaviors are expected
    result = param_json["result"] if "result" in param_json else input["result"] if "result" in input else ""
    request = input["request"] if "request" in input else ""
    file = input["file"] if "file" in input else ""
    # Renamed from `object`, which shadowed the builtin.
    subject = input["object"] if "object" in input else param_json["object"] if "object" in param_json else "Prompt request by email to Fotonower assistant APIA"

    if type(subject) != str:
        print("object has not been converted to string, we will do it !")
        subject = str(subject)

    from auth.lib_conf_system import collect_version_from_datou_and_proj_and_app_recursively
    version = input["version"] if "version" in input else collect_version_from_datou_and_proj_and_app_recursively()

    if privacy :
        privacy_footer = """
    Privacy is ON, RGPD is strictly implemented and no data sent will be kept outside your email to address you issue and keep a record of your usage, please find more info here https://www.fotonower.com/fpa
    Used hash is :
    """ + str(hash_id_treatment)
    else :
        privacy_footer = """
    Privacy is OFF, You can OPT-OUT by sending an email to dpo@fotonower.com with object : OPT-OUT FPA (Fotonower Prompt Assistant) : """ + str(hash_id_treatment) + "<br>\n"

    version_footer = "Generated with Safia " + version + "<br>\n" + \
                     "More info on https://safia.app or https://saxia.app "

    import logging
    logger = logging.getLogger()
    logger.info("before send_mail test GITVELOURS in os.environ ")
    logger.info("before get_info_auth ")
    if type_email == "html":
        # The caller already produced full HTML; use it as-is.
        html = result
    else:
        html = str(info_auth) + str(result) + "<br><br><br>\n" + privacy_footer + "\n" + version_footer
    content_txt = str(info_auth) + "\n" + str(result) + "\n<br>\n<br><br><br>\n" + privacy_footer + "\n" + version_footer

    # Write the raw request next to the input file so it can be attached.
    from lib.lib_speechtotext import remove_extension
    txt_file = remove_extension(file) + ".txt"
    logger.info("After remove_extension ")
    import os, sys
    logger.info(os.getcwd())
    logger.info("to write txt_file " + str(txt_file))
    if file != "":
        with open(txt_file, "w") as f:
            f.write(request)
    else :
        # BUGFIX: the original called shutil.rmtree(txt_file, ignore_errors=True),
        # which only removes directories and therefore silently never deleted
        # the stale file; remove it explicitly instead.
        if os.path.isfile(txt_file):
            os.remove(txt_file)

    logger.info("After write ")

    if send_mail:
        if "GITVELOURS" in os.environ :
            logger.info("YES for GITVELOURS : " + str(os.environ["GITVELOURS"]))
            pythonpathfotonower = os.path.join(os.environ["GITVELOURS"], "python")
            sys.path.append(pythonpathfotonower)
            logger.info("before import ses mailer ! ")
            import mtr.ses_mailer
            ses_mailer = mtr.ses_mailer.SesMailer()

        logger.info("before get_from_mail_to_send ! ")
        dest_mail_list = from_mail_to_send

        logger.info("logger About to send email : " + str(dest_mail_list))

        if "cc" in input:
            dest_mail_list += "," + input["cc"]

        print ("About to send email !")
        if verbose :
            print (" type html : " + str(type(html)))
            print ("html : " + str(html))

        sender = "assistant@fotonower.com"
        try :
            print(" Maybe type_email is useless 28/12/25 ")
            print(" Sending " + sender + " to " + str(dest_mail_list))
            if file != "":
                ret = ses_mailer.send_email_with_attachment(sender, dest_mail_list,
                                                            subject, body_html = html, file_path = txt_file, body_text = content_txt)
            else :
                html = result
                ret = ses_mailer.send_html_email(sender, dest_mail_list, subject, html, content_txt)
        except Exception as e:
            # NOTE(review): send failures are swallowed on purpose (best
            # effort notification) — only printed.
            print(str(e))

    if send_sms:
        print("TODO send_sms")
    if send_slack:
        print("TODO send_slack")

    return {"object" : subject, "body" : result} # , "version" : version

637 

638# git_action 

def datou_safia_step_git_action(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Append the request/result pair as a comment on a GitHub issue.

    param_json must provide "privacy", "defaut_github_issue"
    ("owner/repo/issue_number") and "github_token"; input must provide
    "request" and "result". When privacy is on, or the issue reference /
    token is invalid, nothing is posted.

    Returns {"log_git_action": "git_action_done"}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["request", "result"]
    list_output = []
    list_param_json = ["defaut_github_issue", "github_token", "privacy"]

    privacy = param_json["privacy"]
    defaut_github_issue = param_json["defaut_github_issue"]
    github_token = param_json["github_token"]

    request = input["request"]
    result = input["result"]

    if privacy:
        print("With privacy enabled, we do not append info to any github issues so we continue !")
    else :
        # Expected form: "<owner>/<repo>/<issue_number>".
        own_repo_nb = defaut_github_issue.split("/")
        if github_token != "" and len(own_repo_nb) == 3 and own_repo_nb[2].isdigit():
            own_repo = own_repo_nb[0] + "/" + own_repo_nb[1]
            issue_number = int(own_repo_nb[2])
            from lib.lib_github import append_comment
            message_comment_github = "[up](#up)\n\n" + result + "\n<br>\n----\nMESSAGE BRUT\n------<details>\n\n" + request + "\n</details>"
            append_comment(github_token, verbose = verbose,
                           message_comment = message_comment_github,
                           OwnRepo = own_repo,
                           issue_number = issue_number)
            print("github message appened !")

    output = {"log_git_action" : "git_action_done"}
    return output

668 

669 

670 

671# doc_to_json TO TEST 

def datou_safia_step_doc_to_json(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Convert a document file into the importable JSON format via safia_import.

    input must provide "file"; param_json may provide "managed_extension"
    as a comma-separated list of extensions.

    Returns {"json_path", "log_d2j", "unmanaged_extension"}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["file"]
    list_output = ["json_path", "log_d2j"]
    list_param_json = []  # managed_extension (for freemium ?)

    file = input["file"]

    # TODO VR : get the list from somewhere else
    # BUGFIX: the original used .split(""), which raises
    # ValueError("empty separator"); the list is comma-separated.
    managed_extension = param_json["managed_extension"].split(",") if "managed_extension" in param_json else [".mp3", ".ogg", ".amr", ".m4u", ".wav", ".jpeg", ".jpg", ".png", ".pdf", ".txt", ".docx", ".json", ".py"]

    from lib.lib_safia import safia_import
    json_to_import_path, list_detailed_time_safia_import_to_json, sorted_dict_unmanaged_extension = safia_import(in_file=file, # managed_extension=managed_extension,
                                                                                                                 verbose = False)

    output = {"json_path" : json_to_import_path, "log_d2j" : list_detailed_time_safia_import_to_json, "unmanaged_extension" : str(sorted_dict_unmanaged_extension)}
    return output

688 

689 

690 

691# import_json TO TEST 

def datou_safia_step_import_json(input : dict = None, param_json : dict = None, ce : CE = None,
                                 verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Import a JSON dump (file path or in-memory list) into the documents table.

    input must provide "json_path" or "json_to_save" (one of them);
    param_json may provide "table_documents" and "openai_token". The cost
    estimator is charged with the tokens consumed by the import, or with
    sentinel values ("crashed", -1) when the import fails.

    Returns {"log_import_json": ...}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["json_path", "json_to_save"]  # one of them
    list_output = ["log_import"]
    list_param_json = ["table_documents", "openai_token"]

    json_path = input["json_path"] if "json_path" in input else ""
    json_to_save = input["json_to_save"] if "json_to_save" in input else []

    table_documents = param_json["table_documents"] if "table_documents" in param_json else ""
    openai_token = param_json["openai_token"] if "openai_token" in param_json else ""

    from lib.import_util.lib_import_retrieval.scripts.process_json.process_json import process_json_dump, process_json_dump_aux
    import asyncio

    # logger.info(" before process_json_dump : json_to_import_path : " + str(json_path))
    try :
        from server.safia import lpgss_singleton  # VR: to refactor with abstract classes ?
        lpgss_singleton.get_admin_situation(verbose=verbose)
        if json_path != "":  # TODO better test existence ??
            total_nb_token, used_model = asyncio.run(process_json_dump(json_path, {}, False, False, None, lpgss_singleton, openai_token, table_documents, verbose = verbose))
        else :
            total_nb_token, used_model = asyncio.run(process_json_dump_aux(json_to_save, {}, False, False, None, lpgss_singleton, openai_token, table_documents, verbose = verbose))
        # result_json["log"] += " ,after process json to documents table : total_nb_token : " + str(total_nb_token)

    except Exception as e:
        # Best effort: log the failure and fall through with sentinel
        # values so the cost accounting below still runs.
        import logging
        logger = logging.getLogger()
        logger.info(str(e))
        print(str(e))
        logger.info("Bug in datou_safia_step_import_json")
        print("Bug in datou_safia_step_import_json")
        used_model = "crashed"
        total_nb_token = -1

    ce.compute_cost_search(used_model, total_nb_token)

    output = {"log_import_json" : "Inserted in file : " + str(json_path)}
    return output

733 

734 

735 

def datou_safia_step_get_embedding(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Compute the embedding vector of input["text"] via the OpenAI helper.

    Requires param_json["openai_token"]. Returns the vector under "embedding"
    plus display hints under "info_context_exec" telling the execution context
    to hide the (large) embedding and show the context info.
    """
    list_input = ["text"]
    list_output = ["embedding"]
    list_param_json = ["openai_token"]

    # datou_step
    from lib.lib_openai import get_embeddings

    # embedding_model fixed for now
    vector = get_embeddings(input["text"], param_json["openai_token"], verbose = verbose)

    display_hints = {"display_info" : {"embedding" : "delete", "info_context_exec" : "show"}}

    return {"embedding" : vector, "info_context_exec" : display_hints}

753 

754 

755 

# append_to_doc_content TO TEST
def datou_safia_step_append_to_doc_content(input : dict = {}, param_json : dict = {},
                                           ce : CE = None, verbose : bool = False,
                                           layer_api : LayerGeneric = None) -> dict :
    """Append input["result"] (or input["text"]) to an existing document's content.

    Loads the document identified by document_id/project_id through
    LibSafiaSystem, concatenates the new text, saves it back (which recomputes
    embeddings) and accounts the token cost on `ce`.
    Returns {"result": <appended text>, "references": [document_id]}.
    """
    list_input = ["result", "document_id", "project_id", "user_id"]  # TODO VR: unclear whether datous are right-safe or must check rights themselves =>
    # - [ ] TODO: the terminology still has to be defined
    list_output = ["references"]
    list_param_json = ["openai_token"]  # TODO VR 15-6-23: in_match_count optional => handle optional param_json entries

    openai_token = param_json["openai_token"]

    # Ids may come from input or param_json, with fall-back defaults.
    document_id = input["document_id"] if "document_id" in input else param_json["document_id"] if "document_id" in param_json else ""
    project_id = input["project_id"] if "project_id" in input else param_json["project_id"] if "project_id" in param_json else 0
    user_id = param_json["user_id"] if "user_id" in param_json else 0
    result = input["result"] if "result" in input else input["text"] if "text" in input else ""

    # NOTE: table_documents is read but not used below — kept for future use?
    table_documents = param_json["table_documents"] if "table_documents" in param_json else "dummy_table_documents_no_access"

    # VR TODO 4-12-23: fetching the project_id will be difficult because datous
    # seem bound to one (default) project. We may want varied per-project
    # behavior later (including separate input and output/storage projects).
#    has_access = lss.lib_right.get_role_on_project(lss.get_user_id(), project_id)
    from datetime import datetime
    today_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # NOTE: currently unused
    from server.safia import lpgss_singleton, lib_right_singleton

    # TODO VR: is this really what we want? Shouldn't we rather fetch the lss
    # from the execution context (which would carry the user_id)?
    from lib.lib_safia_system import LibSafiaSystem
    lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
    lss.user_id = user_id  # HACK: impersonate the user directly
    content = lss.load_document(document_id, project_id, chunk_id=None, verbose = verbose)

    new_content = content + "\n" + result

#    list_docs = lpgss_singleton.get_documents(table_documents, document_id, verbose = verbose)
#    if len(list_docs) != 1:
#        print(" Problem with documents to append ! ")

#    content = list_docs[0]["content"] + "\n" + result  # TODO VR 4-12-23: we could also tag the text as an append

    save_document_data = {"document_id" : document_id, "document_content" : new_content}
    # TODO VR: better to abstract this (in the layer) and not pass the
    # openai_token down into process_json or anywhere else!
    total_nb_token, used_model = lss.save_document(save_document_data, project_id, openai_token=openai_token)
    # NOTE(review): ce defaults to None — this crashes if no CostEstimation is passed.
    ce.compute_cost_search(used_model, total_nb_token)

    # TODO VR: the result is probably already inside the document since we
    # append everything as we go!
    output = {"result" : result, "references" : [document_id]}
    return output

804 

# search_doc_NN TO TEST
def datou_safia_step_search_doc_NN(input : dict = {}, param_json : dict = {},
                                   ce : CE = None, verbose : bool = False,
                                   layer_api : LayerGeneric = None) -> dict :
    """Nearest-neighbour document search from an embedding, building a RAG prompt.

    Parameters
    ----------
    input : dict with "embedding" (query vector) and optionally "text"
        (the original question).
    param_json : dict with "match_page_sections" (pgvector match function
        name) and optional "in_match_count" (number of neighbours, default 5).

    Returns the raw matches ("result"), the assembled prompt (duplicated as
    "request" and "text" for downstream steps) and the matched document ids
    as "references".
    """
    list_input = ["embedding"]
    list_output = []
    list_param_json = ["match_page_sections", "in_match_count"]  # TODO VR 15-6-23: in_match_count optional => handle optional param_json; VR 4-12-23: should these rather live in input?

    match_page_sections = param_json["match_page_sections"]
    in_match_count = param_json["in_match_count"] if "in_match_count" in param_json else 5
    embedding = input["embedding"]
    text = input["text"] if "text" in input else ""

    from server.safia import lpgss_singleton

    from lib.stockage.lib_pgvector import find_docs
    # BUGFIX: in_match_count and verbose were previously hard-coded (5 and
    # False), silently ignoring the caller-provided parameters.
    result = find_docs(embedding, lpgss_singleton,
                       function = match_page_sections,
                       in_match_count = in_match_count,
                       verbose = verbose)

    preprompt = "Merci de repondre à la question à partir des documents et ne pas mentionné que tu es un chatbot sinon quelqu'un va mourir :"

    request = preprompt + text + str(list(map(lambda x : x["document_id"] + " " + x["content"], result)))

    list_document_ids = list(map(lambda x : x["id"], result))  # document_id is without the chunk id

    print("request : " + str(request))

    output = {"result" : result, "request" : request, "text" : request, "references" : list_document_ids}
    return output

836 

837 

838 

# result_to_json
def datou_safia_step_result_to_json(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Package a request/result pair as a JSON document list ready to be saved.

    Builds two document entries (request and result) with ids derived from a
    sanitized user marker plus a timestamp, appending a references section to
    the result text. Returns {"json_to_save": [...], "references": [...]}.
    """
    list_input = ["request", "result"]
    list_output = []
    list_param_json = ["user"]

    print("TO USE TO CREATE NEW STEP")

    user = param_json["user"] if "user" in param_json else None
    if user == None:
        print(" This show map_reduce doesn't pass correclty the complete_param_json and that this param doesn't have the user key because the mismatch between app_param, user_param, exec_param, step_param is not well defined ! ")
        return {}

    prefix = param_json["prefix"] if "prefix" in param_json else ""

    from lib.lib_util import replace_non_alpha_with_underscore

    # NOTE(review): dead branch — user can no longer be None here because of
    # the early return above, so the else always runs and every user becomes
    # "0.0.0.0". Likely intended: sanitize the real user string instead.
    if user == None:
        user = replace_non_alpha_with_underscore("anonymous@opio.fr")
    else :
        user = "0.0.0.0"

    from datetime import datetime
    curr = datetime.now().strftime("%Y%m%d%H%M%S%f")
    # NOTE(review): prefix defaults to "" and is never None, so the first
    # branch is unreachable; both branches produce the same name when prefix
    # is empty anyway.
    if prefix == None:
        name = user + "_" + curr
    else :
        name = prefix + user + "_" + curr

    # NOTE: total_nb_token/used_model are initialized but never used here.
    total_nb_token = 0
    used_model = ""
    references = []
    if "request" in input and "result" in input:
        request = input["request"]
        result = input["result"]

        id_request = "///UPLOAD//REQUEST//" + name
        id_result = "///UPLOAD//RESULT//" + name
        references = [id_request, id_result]

        # TODO VR : here we should parse title in result and set in request and result ?
        # TODO VR : how should hostname be configured, in context_process_server ?
        # TODO VR: temporary files must be deleted, keeping only the wanted CDN ones

        # Build the human-readable references section appended to the result.
        list_reference = "\n## No references\n"
        if "references" in input:
            list_reference = "\n## References Internal and External\n"
            r = 0
            for ref in input["references"]:
                # Audio references are served as mp3 once converted.
                if ref.endswith(".ogg"):
                    ref = ref.replace(".ogg", ".mp3")
                list_reference += "Ref " + str(r) + " : " + ref + "\n"
                r = r + 1

        data = [
            {
                "id" : id_request,
                "text" : request
            },
            {
                "id" : id_result,
                "text" : result + list_reference
            }
        ]
    else :
        data = []

    output = {"json_to_save" : data, "references" : references}
    return output

908 

909 

910 

# load_graph => to debug TODO VR to debug
def datou_safia_step_load_existing_graph(input : dict = {}, param_json : dict = {},
                                         ce : CE = None, verbose : bool = False) -> dict :  # pragma no cover icebox
    """Load the previously saved graph and wrap it as a pre-prompt fragment."""
    list_input = []  # TODO VR: add "file"
    list_output = ["preprompt"]
    list_param_json = []  # TODO VR REFACTO

    print("TO define")
    # TODO VR 14-6: how do we assemble prompts from several pieces?
    # VR: this has not worked yet!
    # if object == "edit_graph":
    from lib.lib_graph import read_graph

    serialized_graph = read_graph(temp_dir = "static/temp/graph",
                                  graph_name = "graph",
                                  verbose = verbose)
    preprompt_text = "\nEn prenant comme graph de départ celui-ci :\n" + serialized_graph
    return {"preprompt" : preprompt_text}

928 

def datou_safia_step_load_url_content_text(input: dict = {},
                                           param_json: dict = {},
                                           ce: CE = None,
                                           verbose: bool = False,
                                           layer_api : LayerGeneric = None) -> dict:
    """Fetch a web page and return its visible text under the "content" key.

    On request failure, returns {"content": "Error in request" + <error>}.
    """
    list_input = ["url"]
    list_output = ["content"]
    list_param_json = []

    import requests
    from bs4 import BeautifulSoup

    # URL of the page whose text we want to extract (company site by default).
    url = input["url"] if "url" in input else "https://www.fotonower.com/"
    print("url : " + str(url))

    try:
        response = requests.get(url)
    except Exception as e:
        print(str(e))
        return {"content": "Error in request" + str(e)}

    # Parse the HTML, flatten it to text and normalize separators to spaces.
    page_text = BeautifulSoup(response.text, 'html.parser').get_text(separator='<br>', strip=True)
    for unwanted in ("|", "\n", "\r"):
        page_text = page_text.replace(unwanted, " ")

    if verbose:
        print(page_text)

    return {"content": page_text}

968 

def aux_map_reduce_loop(list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps, param_json,
                        verbose, privacy, with_audit, strat_reduce, reduced_result, list_audit_map_reduce,
                        hit_main_datou_step_map_reduce = "to_be_passed_as_argument", id_step_incomplete_args = None):
    """Sequential map phase of map_reduce: run the sub-datou on each chunk and fold results.

    For each element of list_texts, executes the datou described by
    (list_steps, list_param_json_steps) via datou_exec, collects audit info
    into list_audit_map_reduce (mutated in place), and combines outputs into
    reduced_result according to strat_reduce ("concat" concatenates,
    "append_page" relies on in-place mutation of the shared page objects,
    "run_one_datou" keeps the last output dict). Returns reduced_result.
    """
    for textbout in list_texts:
        if verbose:
            print("text : " + str(textbout))
            print(" hit_main from map_reduce : " + hit_main_datou_step_map_reduce)
        from lib.datou.datou_exec import datou_exec
        # For "run_one_datou" the chunk is already a full input dict;
        # otherwise wrap the chunk under the expected input key.
        input_datou = textbout if strat_reduce == "run_one_datou" else {aux_input_var: textbout}

        # TODO clean up: nothing is forced for now VR 15-1-25
        # Save the audit flag so it can be restored after the sub-execution.
        with_audit_save_var = with_audit
        # with_audit = False
        if 'with_audit' in input_datou:
            with_audit_save = input_datou['with_audit']
            # input_datou['with_audit'] = False

        output, audit_json = datou_exec(list_steps, input=input_datou,
                                        complete_param_json=param_json, verbose=verbose,
                                        privacy=privacy, list_param_json_steps=list_param_json_steps,
                                        with_audit=with_audit, id_step_incomplete_args = id_step_incomplete_args)

        # "run_one_datou" keeps the whole output dict; otherwise extract the
        # requested field, defaulting to "".
        result = output if strat_reduce == "run_one_datou" else output[res_json_field] if res_json_field in output else ""
        if with_audit:
            if verbose:
                print("audit_json : " + str(audit_json))
            list_audit_map_reduce.append(audit_json)

        # Restore the audit flag on the (possibly shared) input dict.
        if 'with_audit' in input_datou:
            try :
                input_datou['with_audit'] = with_audit_save
            except Exception as e:
                print(" Incompréhensible que cela ne fonctionne pas")
        with_audit = with_audit_save_var

        # TODO TO TEST VR 26-1-24 : and add image also ?!?
        # df[res_json_field] = result

        if strat_reduce == "concat":
            reduced_result += result
        elif strat_reduce == "append_page":
            print(
                "Thanks object (sub_page_doc) that are reference in python, the result is already at its correct position")
        elif strat_reduce == "run_one_datou":
            reduced_result = result
            reduced_result["hit_internal"] = "tofind"
        else:
            print("Unsupported : strat_reduce : " + str(strat_reduce))

    return reduced_result

1019 

def datou_safia_step_map_reduce(input : dict = {},
                                param_json : dict = {},
                                ce : CE = None,
                                verbose : bool = False,
                                layer_api : LayerGeneric = None) -> dict :
    """Split the input into chunks, run a sub-datou on each, and reduce the results.

    The splitting strategy is chosen by param_json["strat_reduce"]:
    "concat" / "concat_stride" (text chunks concatenated back), "append_page"
    (per-page, results written in place), "append_doc" (per-document), or
    "run_one_datou" (the whole input dict is forwarded once). The sub-datou is
    looked up by datou_int_id and executed either sequentially
    (aux_map_reduce_loop) or in parallel. Returns the reduced text plus a
    "json_to_save" document and the collected audits.
    """
    list_input = ["text", "datou_int_id", "strat_reduce", "param.size", "param.overlap", "res_json_field"]
    # TODO update sql input
    list_output = []
    list_param_json = []
    with_audit = input["with_audit"] if "with_audit" in input else False

#    "text", "datou_int_id", "strat_reduce", "param.size", "param.overlap"

    text = input["text"] if "text" in input else ""
    list_page_content = input["list_page_content"] if "list_page_content" in input else []
    list_page_content_text = input["list_page_content_text"] if "list_page_content_text" in input else []
    paragraphs = input["paragraphs"] if "paragraphs" in input else []
    id_step_incomplete_args = param_json["id_step_incomplete_args"] if "id_step_incomplete_args" in param_json else None

    # VR 19-5: not sure anymore why the lookup in input is needed here
    datou_int_id = input["datou_int_id"] if "datou_int_id" in input else param_json["datou_int_id"] if "datou_int_id" in param_json else -1
    strat_reduce = param_json["strat_reduce"] if "strat_reduce" in param_json else "concat"
    param = param_json["param"] if "param" in param_json else {"size" : 10000, "overlap" : 1000}
    res_json_field = input["res_json_field"] if "res_json_field" in input else "result"
    list_page_per_doc = input["list_page_per_doc"] if "list_page_per_doc" in input else param["list_page_per_doc"] if "list_page_per_doc" in param else 0  # VR 17-5 TODO: reading this from param is probably a debugging leftover

    parallel = bool(param_json["parallel"]) if "parallel" in param_json else False
    nb_thread = param_json["nb_thread"] if "nb_thread" in param_json else 10

    # No per-document grouping provided: default to one page per document.
    if list_page_per_doc == 0:
        print("CHECK ERROR using default list of page not grouped by document, maybe normal if it is the first map_reduce where we try to classify : datou_int_id : " + str(datou_int_id))
        if len(list_page_content_text) != len(list_page_content):
            print("ERROR migration datou_exec_partial_data_json")
        list_page_per_doc = ";".join(list(map(str, range(1, len(list_page_content_text) + 1))))

    from lib.lib_util import split_text, split_text_by_doc, split_list_page_by_doc, split_list_page_by_page
    aux_input_var = "text"
    curr_datou_id = None
    # NOTE(review): if strat_reduce is unsupported or a split raises,
    # list_texts may be unbound below — TODO confirm intended behavior.
    if strat_reduce == "concat":
        print("concat")

        # Full concatenated text (should we split on BEGIN/END? GRRR).
        # This case is for old version generated before 30/6/25 when we want to do partial exec for stat study !
        if list_page_content == []:
            from lib.lib_util import managing_deprecated_input_text_concat_into_list
            list_texts = managing_deprecated_input_text_concat_into_list(text, list_page_per_doc)

        else:
            if text != None and (len(list_page_per_doc) == len(text) or (len(list_page_per_doc) == 1 and type(text) == str)):
                print("NEVER CORRECT OR USELESS ANYWAY We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
            # if type(text) == str:
            #     list_texts = [text]

            try:
                list_texts = split_text_by_doc(list_page_content_text, list_page_per_doc)
            except Exception as e:
                print("l 1038 If only one doc it could run !, and maybe alos with multiple doc by the way")
                print(str(e))
    elif strat_reduce == "append_page":
        list_texts = split_list_page_by_page(paragraphs)
        aux_input_var = "paragraphs"
    elif strat_reduce == "append_doc":
        if text != None and (len(list_page_per_doc) == len(text) or (len(list_page_per_doc) == 1 and type(text) == str)):
            print("We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
            if type(text) == str:
                list_texts = [text]
        try:
            list_texts = split_list_page_by_doc(list_page_content_text, list_page_per_doc)
        except Exception as e:
            print("l 1051 If only one doc it could run !, and maybe also with multiple doc by the way")
            print(str(e))
        aux_input_var = "list_page_content"
    elif strat_reduce == "concat_stride":  # TODO add test for these arguments
        list_texts = split_text(text, param["size"], param["overlap"])
    elif strat_reduce == "run_one_datou":
        # default behavior set internal datou id for run_one_datou
        if "datou_exec_info" in input and "mtr_datou_id" in input["datou_exec_info"]:
            curr_datou_id = input["datou_exec_info"]["mtr_datou_id"]
            input["datou_exec_info"]["mtr_datou_id"] = datou_int_id
# VR42: to add — probably only when with_audit or when datou_exec_info is present => and we would need it all the time in my opinion!
        if "datou_exec_info" in input:
            input["datou_exec_info"]["mtr_datou_id"] = datou_int_id
        list_texts = [input]
    else :
        print("Unsupported : strat_reduce : " + str(strat_reduce))

    # user HACK right on datou
    from server.safia import lpgss_singleton, lib_right_singleton

    # TODO VR: is this really what we want? Shouldn't we rather fetch the lss
    # from the execution context (which would carry the user_id)?
    from lib.lib_safia_system import LibSafiaSystem
    lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
    lss.user_id = 1  # HACK: hard-coded admin user (see l. 1106)
    project_id = input["project_id"] if "project_id" in input else None
    datous = lss.get_datou(datou_int_id, project_id=project_id)

    # Locate the requested datou among the returned list.
    datou = None
    for d in datous:
        if d["id"] == datou_int_id:
            datou = d
            break

    if datou == None:
        print("Datou unavailable " + str(datou_int_id))
        return {"text" : "Datou unavailable " + str(datou_int_id)}

    import pandas as pd
    # NOTE(review): type(input["df"]) == type(pd.DataFrame) compares against
    # the metaclass `type`, so it is True only if input["df"] is itself a
    # class — likely intended isinstance(input["df"], pd.DataFrame).
    # Debug-only block (just prints), so behavior is harmless.
    if "df" in input and type(input["df"]) == type(pd.DataFrame):
        df = input["df"] if "df" in input else None
#        print("len list_texts : " + str(len(list_texts)))
        print("len(df.values) : " + str(len(df.values)))

    # Initialize the accumulator for the reduce phase.
    reduced_result = None
    if strat_reduce == "concat":
        reduced_result = ""
    else :
        print("NOT needed (message isn't correct 16-1-25 ) Unsupported : strat_reduce : " + str(strat_reduce))
        reduced_result = None

    list_audit_map_reduce = []
    id_intern_map = 0  # NOTE: currently unused

    hit_main = input["hash_id_treatment"] if "hash_id_treatment" in input else "hit_unknown"

    privacy = True
    list_param_json_steps = list(map(lambda x: x["param_json"], datou["steps"]))
    list_steps = list(map(lambda x: x["name"], datou["steps"]))
    if parallel :
        print("PALAFI")
        from lib.datou.lib_parallel import datou_parallel_map_reduce
        reduced_result, list_audit_map_reduce = datou_parallel_map_reduce(list_texts, res_json_field, aux_input_var,
                                                                          list_steps, list_param_json_steps, param_json,
                                                                          verbose, privacy, with_audit, strat_reduce,
                                                                          nb_thread, hit_main, id_step_incomplete_args)
        print("PALAVI")
    else:
        # input:
        #   list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps, param_json, verbose, privacy, with_audit, strat_reduce, list_audit_map_reduce
        # output:
        #   reduced_result (list_audit_map_reduce is mutated in place)
        reduced_result = aux_map_reduce_loop(list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps,
                                             param_json,
                                             verbose, privacy, with_audit, strat_reduce, reduced_result, list_audit_map_reduce, hit_main, id_step_incomplete_args)

    document_safia_id = "map_reduce_" + (input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4()))

    data = [
        {
            "id": document_safia_id,
            "text": reduced_result
        }
    ]

    # Restore the caller's mtr_datou_id if we overrode it for run_one_datou.
    if curr_datou_id != None:
        input["datou_exec_info"]["mtr_datou_id"] = curr_datou_id
    output = {"text" : reduced_result, "json_to_save" : data, "list_audit_map_reduce" : list_audit_map_reduce}
    if output["text"] == None:
        del output["text"]
    return output

1179 

1180 

1181def datou_safia_step_load_tab(input : dict = {}, 

1182 param_json : dict = {}, 

1183 ce : CE = None, 

1184 verbose : bool = False, 

1185 layer_api : LayerGeneric = None) -> dict : 

1186 list_input = ["text"] 

1187 list_output = ["df"] # and others custom 

1188 list_param_json = ["col_to_input"] 

1189 

1190 col_to_input = param_json["col_to_input"] if "col_to_input" in param_json else [] 

1191 output_df = param_json["output_df"] if "output_df" in param_json else "df" 

1192 merge = param_json["merge"] if "merge" in param_json else None 

1193 multi_input = input["multi_input"] if "multi_input" in input else False 

1194 aggregate_multi_input = input["aggregate_multi_input"] if "aggregate_multi_input" in input else False 

1195 saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else "" 

1196 

1197 max_nb_try = param_json["max_nb_try"] if "max_nb_try" in param_json else 3 

1198 trigger = param_json["trigger"] if "trigger" in param_json else "nb_pages_80" 

1199 retry_step_id = param_json["retry_step_id"] if "retry_step_id" in param_json else 2 

1200 to_be_used = param_json["to_be_used"] if "to_be_used" in param_json else "split_at_10" 

1201 

1202 # markdown, json, auto_detect ?? 

1203 format_input = param_json["format_input"] if "format_input" in param_json else "markdown" 

1204 

1205 {"nb_try": 3, "trigger": "nb_pages_80", "retry_step_id": 2, "to_be_used": "split_at_10"} 

1206 

1207 saxia_all_doc_separated = input["saxia_all_doc_separated"] if "saxia_all_doc_separated" in input else False 

1208 

1209 nb_page = input["nb_page"] if "nb_page" in input else 0 

1210 

1211 if saxia_all_doc_separated: 

1212 print("Need to use only saxia_split_end_csv : " + str(saxia_split_end_csv)) 

1213 print("We should force format_input to json") 

1214 input["result"] = [] 

1215 format_input = "json_as_dict" 

1216 id_page_id = 0 

1217 if saxia_split_end_csv != "": 

1218 saxia_split_end_csv += "," 

1219 saxia_split_end_csv += str(nb_page) 

1220 for list_of_page_read in saxia_split_end_csv.split(","): 

1221 list_of_page_read_int = int(list_of_page_read) 

1222 list_of_page_list = list(range(id_page_id + 1, list_of_page_read_int + 1)) 

1223 id_page_id = list_of_page_read_int 

1224 list_of_page_csv = ",".join(list(map(str, list_of_page_list))) 

1225 one_list_of_page = {"Liste des pages": [list_of_page_csv]} 

1226 # Titre 

1227 # Nombre de pages 

1228 # Commentaires 

1229 # document_type 

1230 input["result"].append(one_list_of_page) 

1231 

1232 if id_page_id != nb_page: 

1233 print("ERROR : saxia_split_end_csv : " + str(saxia_split_end_csv) + " and nb_page : " + str(nb_page)) 

1234 

1235 # find function used in datou_batch to maje df from text 

1236 from lib.batch.lib_batch import create_pandas_table_from_text 

1237 from lib.lib_util import parse_json_from_prompt_result 

1238 

1239 result = input["result"]# if "result" in input else "" 

1240 # Il faut vérifier que result = text fasse planter les tests 

1241 if type(result) == list: 

1242 #result = result[0] 

1243 print(" ERROR treated as WARNING : only the first file will be treated : " + str(result)) 

1244 print("TODO multiple files not implemented yet !") 

1245 if multi_input: 

1246 if aggregate_multi_input: 

1247 # result = " ".join(result) 

1248 print("We will need to do something !") 

1249 else: 

1250 print("internal ERROR : multiple files not implemented yet !") 

1251 

1252 import pandas as pd 

1253 # todo vr 27-12-23 normaliser les input et utilisation des assoc 

1254 if multi_input or saxia_split_end_csv != "": 

1255 print(" For now aggregate_multi_input is set to True by default in case of multi input") 

1256 complete_df = None 

1257 for res in result : 

1258 if format_input == "markdown": 

1259 df = create_pandas_table_from_text(res, verbose = verbose) if "result" in input else pd.DataFrame(columns=["Init"]) 

1260 elif format_input == "json": 

1261 df = pd.DataFrame(parse_json_from_prompt_result(res)) 

1262 elif format_input == "json_as_dict": 

1263 df = pd.DataFrame(res) 

1264 else: 

1265 print("format_input " + format_input + " not implemented yet ! ") 

1266 if type(complete_df) == types.NoneType: 

1267 complete_df = df 

1268 else: 

1269 complete_df = pd.concat([complete_df, df], axis=0, ignore_index=True) 

1270 df = complete_df 

1271 else: 

1272 if format_input == "markdown": 

1273 df = create_pandas_table_from_text(result, verbose = verbose) if "result" in input else pd.DataFrame(columns=["Init"]) 

1274 elif format_input == "json": 

1275 df = pd.DataFrame(parse_json_from_prompt_result(result)) 

1276 else: 

1277 print("format_input " + format_input + " not implemented yet ! ") 

1278 

1279 output = {output_df : df} 

1280 for col in col_to_input: 

1281 strat = col["strat"] if "strat" in col else "concat_sccsv" 

1282 col_name = col["col_name"] if "col_name" in col else None 

1283 input_name = col["input_name"] if "input_name" in col else None 

1284 all_data_to_clean = list(df[col_name]) if type(df) == pd.DataFrame and col_name in df.columns else [] 

1285 # TODO better car mieux vaut quick and dirty que de ne pas avancer, vraisemblablement VR 20-12-23 

1286 try: 

1287 all_data = [d.replace(" ", "") for d in all_data_to_clean] # .replace("-", ",") = VR 14-2-24 a present gérer par le parsing begin_end manuelle avec 3 etats 

1288 except : 

1289 all_data = all_data_to_clean 

1290 io_data_datou = ";".join(all_data) 

1291 if input_name : 

1292 output[input_name] = io_data_datou 

1293 

1294 if merge != None or trigger == "nb_pages_80": 

1295 df_by_page = df 

1296 df_by_document = df # input["df"] if "df" in input else None 

1297 # Faire une boucle sur les documents et les pages pour vérifier que c'est correct 

1298 # TODO VR 27-12-23 : faire un test pour vérifier que c'est correct 

1299 print(" In datou_safia_step_load_tab ") 

1300 try: 

1301 res1 = df_by_document.to_markdown() 

1302 res2 = df_by_page.to_markdown() 

1303 res3 = df_by_document.to_json() 

1304 res4 = df_by_page.to_json() 

1305 if verbose : 

1306 print(res1) 

1307 print(res2) 

1308 print(res3) 

1309 print(res4) 

1310 except Exception as e: 

1311 print(str(e)) 

1312 try: 

1313 # La liste des pages dans df_by_document a pour nom de colonne "Liste des pages" 

1314 # L'information dans df_by_page sur le fait d'etre le debut ou la fin d'un document est dans la colonne Information_debut_fin 

1315 # Pour chaque ligne de df_by_document, on va chercher les pages correspondantes dans df_by_page et vérifier qu'il n'y a qu'une seule page debut et fin de document sinon on renvoie un mesage d'erreur 

1316 # Merci de faire une boucle sur df_by_document et pour chaque ligne de df_by_document, on va chercher les pages correspondantes dans df_by_page et vérifier qu'il n'y a qu'une seule page debut et fin de document sinon on renvoie un mesage d'erreur 

1317 list_all_page = [] 

1318 for index, row in df_by_document.iterrows(): 

1319 liste_des_pages = row["Liste des pages"] 

1320 from lib.lib_util import parse_list_page_as_begin_end_separated 

1321 try: 

1322 list_page_one_document = list(map(int, liste_des_pages.split(","))) 

1323 except Exception as e: 

1324 print("Trying to parse list_page_per_doc as begin and end separated : " + str(e)) 

1325 list_page_one_document = parse_list_page_as_begin_end_separated(liste_des_pages) 

1326 print("list_page_one_document : " + str(list_page_one_document)) 

1327 if len(list_page_one_document) == 0: 

1328 print("ERROR") 

1329 

1330 list_all_page.extend(list_page_one_document) 

1331 if False: 

1332 list_info_debut_fin = [] 

1333 for page_nb in list_page_one_document: 

1334 df_by_page_one_page = df_by_page[df_by_page["Numéro de La Page"] == page_nb] 

1335 Information_debut_fin = df_by_page_one_page["Information_debut_fin"].values 

1336 list_info_debut_fin.append(Information_debut_fin) 

1337 # Maintenant on compte les multiples 

1338 count_begin = list_info_debut_fin.count(lambda x : "debut" in x.lower()) 

1339 count_end = list_info_debut_fin.count(lambda x : "fin" in x.lower()) 

1340 if count_begin > 1 or count_end > 1: 

1341 print("ERROR OR WARNING multiple debut fin : " + str(list_info_debut_fin)) 

1342 

1343 # On va vérifier que toutes les pages de df_by_page sont bien dans df_by_document sinon on les rajoute séparément comme un document à chaque fois 

1344 if list_all_page != list(set(list_all_page)): 

1345 print("ERROR OR WARNING Multiple page : " + str(list_all_page)) 

1346 

1347 if set(range(1, nb_page)) != set(list_all_page): # nb_page 

1348 print("ALL page : " + str(list_all_page)) 

1349 list_missing_page = list(set(range(1, nb_page)) - set(list_all_page)) 

1350 for missing_page in list_missing_page: 

1351 import pandas as pd 

1352 df_by_document = pd.concat([df_by_document, pd.DataFrame({"Liste des pages" : str(missing_page), "document_type" : "Added for completion"}, index=[len(df_by_document)])], ignore_index=True) 

1353 

1354 if trigger == "nb_pages_80": 

1355 if len(list_missing_page) > 0.2 * float(nb_page): 

1356 output["retry"] = True 

1357 output["retry_step_id"] = retry_step_id 

1358 output["max_nb_try"] = max_nb_try 

1359 print("TRIGGER RETRY IN LOAD TAB") 

1360 if to_be_used == "split_at_10": 

1361 print("Ici on pourrait reconstuirer les texts de input selon un split pris en entrée ou bien un split par défaut par exemple toutes les 10 ou 15 pages") 

1362 print("OUI ! Ici on pourrait reconstuirer les texts de input selon un split pris en entrée ou bien un split par défaut par exemple toutes les 10 ou 15 pages") 

1363 

1364 except Exception as e: 

1365 if trigger == "nb_pages_80": 

1366 output["retry"] = True 

1367 output["retry_step_id"] = retry_step_id 

1368 output["max_nb_try"] = max_nb_try 

1369 print("ERROR IN TRIGGER RETRY IN LOAD TAB") 

1370 print(str(e)) 

1371 

1372 # On va vérifier que l'ordre des pages d'un sous-document n'est pas modifié sinon on renvoie un message 

1373 

1374# if trigger == "nb_pages_80": 

1375 

1376# print("ERROR OR WARNING : nb_pages_80 : " + str(len(df))) 

1377 

1378 if saxia_all_doc_separated: 

1379 print("TO TEST !") 

1380 if "result" in input: 

1381 print("WARNING saxia_all_doc_separated Not expected ! ") 

1382 else : 

1383 input["result"] = "" 

1384 if "text_only_for_meta_data_and_not_split" in input: 

1385 input["text"] = input["text_only_for_meta_data_and_not_split"] 

1386 else: 

1387 print("All split csv should be tested more carefully, missing key text_only_for_meta_data_and_not_split") 

1388 del input["text_only_for_meta_data_and_not_split"] 

1389 

1390 if type(output[output_df]) != types.NoneType: 

1391 try: 

1392 output[output_df]["prediag"] = "MISSING" 

1393 if "prediag_csv" in input and input["prediag_csv"] != "" and input["prediag_csv"] != None: 

1394 print("We can add prediag in df but we need list_of_list_of_file") 

1395 # ca ca ne va pas : list_page_per_doc 

1396# list_page_per_doc = input["list_page_per_doc"] if "list_page_per_doc" in input else ";".join(list(map(str, list(range(1, nb_page + 1))))) 

1397# list_of_list_of_page_per_doc = list(map(lambda x: list(map(int, x.split(","))), list_page_per_doc.split(";"))) 

1398 

1399 prediag_csv = input["prediag_csv"] 

1400 list_prediag = prediag_csv.split(",") 

1401 if len(list_prediag) == nb_page: 

1402 # iterer sur le df et pour chaque ligne, on va chercher la page correspondante dans list_page_per_doc 

1403 for index, row in output[output_df].iterrows(): 

1404 liste_des_pages = row["Liste des pages"] 

1405 list_page_one_document = [] 

1406 if liste_des_pages != '': 

1407 list_page_one_document = list(map(int, liste_des_pages.split(","))) 

1408 # On va chercher la page correspondante dans list_page_per_doc 

1409 sub_prediag_csv = "" 

1410 for page_nb in list_page_one_document: 

1411 if page_nb - 1 < len(list_prediag): 

1412 if sub_prediag_csv != "": 

1413 sub_prediag_csv += "," 

1414 sub_prediag_csv += list_prediag[page_nb - 1] 

1415 else: 

1416 print("PROBLEMA CHECK !") 

1417 output[output_df].loc[index, "prediag"] = sub_prediag_csv 

1418 else: 

1419 print("PROBLEMB CHECK : " + str(nb_page) + " len(list_prediag) : " + str(len(list_prediag))) 

1420 else : 

1421 print("MISSING PREDIAG PROBLEMC CHECK !, TO DO PLEASE") 

1422 except Exception as e: 

1423 print(str(e)) 

1424 else: 

1425 print("ERROR CHECK : df is None") 

1426 

1427 print("END OF LOAD TAB") 

1428 return output 

1429 

1430 

1431 

1432# Keep for easy implementation of new function (remove pragma no cover and complete all) 

1433# INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('anon', 'datou_safia_step_anon', '["openai_token"]', '["list_page_content"]', '["result"]'); 

def datou_safia_step_anon(input : dict = None,
                          param_json : dict = None,
                          ce : CE = None,
                          verbose : bool = False,
                          layer_api : LayerGeneric = None) -> dict :
    """Anonymise a document and return the path of the anonymised PDF.

    The heavy lifting is delegated to ``lib.anon.lib_anon.anon_document``;
    this step only gathers its arguments from ``input`` / ``param_json``
    and repackages the result.

    Parameters
    ----------
    input : dict, optional
        Step input. Keys read: ``result``, ``list_page_content``,
        ``hash_id_treatment``, ``out_folder``, ``prefix_file``.
    param_json : dict, optional
        Step configuration: ``keyword``, ``exclude_word_split``,
        ``word_to_keep``, ``exclude_pers``, ``exclude_bib_start``,
        ``append_detected_to_output``, ``anon_all_unread``,
        ``remove_search_string_for_key_private_data`` and
        ``data.par`` (per-page paragraph dicts).
    ce : CE, optional
        Cost-estimation helper; unused in this step, kept for the common
        step-function signature.
    verbose : bool
        When True, print the anonymised PDF path.
    layer_api : LayerGeneric, optional
        Unused in this step, kept for the common step-function signature.

    Returns
    -------
    dict
        ``{"pdf_anon": <path>}``, optionally extended with the detected
        private-data info when ``append_detected_to_output`` is set
        (list values are flattened to ``;``-separated strings).
    """
    # `None` sentinels replace the former mutable `{}` defaults so a shared
    # dict can never leak state between calls.
    input = {} if input is None else input
    param_json = {} if param_json is None else param_json

    # Declared step interface (kept for consistency with the other step
    # functions in this module; not used at runtime here).
    list_input = []
    list_output = []
    list_param_json = []

    # Configuration, with the same defaults as before.
    keyword = param_json.get("keyword", {})
    exclude_word_split = param_json.get("exclude_word_split", [])
    word_to_keep = param_json.get("word_to_keep", [])
    exclude_pers = param_json.get("exclude_pers", ["docteur"])
    exclude_bib_start = param_json.get("exclude_bib_start", ["docteur ", "dr. ", "dr "])
    append_detected_to_output = bool(param_json.get("append_detected_to_output", False))
    anon_all_unread = bool(param_json.get("anon_all_unread", False))
    # List of keys whose info must be removed to achieve anonymisation.
    remove_search_string_for_key_private_data = param_json.get("remove_search_string_for_key_private_data", [])

    result_info_to_anon = input.get("result", "")
    list_page_content = input.get("list_page_content", [])

    # Paragraphs whose class is not "content" get bounding-box aliases
    # (xmin/ymin/xmax/ymax) so the anonymiser can mask them, and their text
    # is replaced by the "KKK" placeholder (the original text is preserved
    # in "old_text").  NOTE: this mutates the caller's paragraph dicts in
    # place — `p` is not copied.
    all_paragraphs = param_json.get("data", {}).get("par", [])
    paragraphs_to_anon = []
    for one_page in all_paragraphs:
        one_page_to_anon = []
        for p in one_page:
            if "class" in p and p["class"] != "content":
                p["xmin"] = p["x"]
                p["ymin"] = p["y"]
                p["xmax"] = p["x"] + p["w"]
                p["ymax"] = p["y"] + p["h"]
                p["old_text"] = p["text"]
                p["text"] = "KKK"
                one_page_to_anon.append(p)
        paragraphs_to_anon.append(one_page_to_anon)

    hash_id_treatment = input.get("hash_id_treatment", str(uuid4()))
    out_folder = input.get("out_folder")  # e.g. "static/temp/anon"; None lets the callee decide

    from lib.anon.lib_anon import anon_document
    from lib.lib_util import create_prefix_file_name_from_json_prefix

    prefix_file = create_prefix_file_name_from_json_prefix(input["prefix_file"]) if "prefix_file" in input else ""

    # Shallow copy: the outer list is new but the page lists and paragraph
    # dicts are shared with `paragraphs_to_anon`.
    paragraphs_to_anon_copy = paragraphs_to_anon.copy()
    list_pngs, pdf_anon, json_info_to_anon = anon_document(result_info_to_anon, list_page_content, verbose=False,
                                                          keyword = keyword, hash_id_treatment=hash_id_treatment,
                                                          prefix_file = prefix_file,
                                                          word_to_keep = word_to_keep,
                                                          exclude_word_split=exclude_word_split,
                                                          anon_all_unread=anon_all_unread,
                                                          remove_search_string_for_key_private_data=remove_search_string_for_key_private_data,
                                                          exclude_pers = exclude_pers,
                                                          exclude_bib_start = exclude_bib_start,
                                                          out_folder=out_folder,
                                                          paragraphs_to_anon=paragraphs_to_anon_copy)

    if verbose:
        print(" pdf_anon : " + str(pdf_anon))

    output = {"pdf_anon" : pdf_anon}

    if append_detected_to_output:
        for key in json_info_to_anon:
            # Flatten list values so every exported value is a scalar string.
            if isinstance(json_info_to_anon[key], list):
                json_info_to_anon[key] = ";".join(json_info_to_anon[key])
        output.update(json_info_to_anon)

    return output

1504 

1505 

1506 

1507 

1508# INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('format', 'datou_safia_step_format', '["format_info"]', '["result"]', '["result"]'); 

1509def datou_safia_step_format(input : dict = {}, 

1510 param_json : dict = {}, 

1511 ce : CE = None, 

1512 verbose : bool = False, 

1513 layer_api : LayerGeneric = None) -> dict : 

1514 list_input = [] 

1515 list_output = [] 

1516 list_param_json = [] 

1517 config_project = param_json["config_project"] if "config_project" in param_json else {} 

1518 format = config_project["saxia"]["format"] if "saxia" in config_project and "format" in config_project["saxia"] else {} 

1519 format_json_from_conf = format["info_format_intro"] if "info_format_intro" in format else {} 

1520 default_format_intro_hc = "Le {datet}, {document_type} par le Docteur {medecin_nom}, {medecin_specialite} :" 

1521 format_info = format_json_from_conf["format"] if "format" in format_json_from_conf else {} 

1522 info_format_intro = format_json_from_conf["format"]["intro"] if "format" in format_json_from_conf and "intro" in format_json_from_conf["format"] else {"default": default_format_intro_hc} 

1523 

1524 exec_if_true = param_json["exec_if_true"] if "exec_if_true" in param_json else True 

1525 if not exec_if_true or exec_if_true == {}: 

1526 print(" dont_exec_if_false is True, we skip the formatting step ") 

1527 return input 

1528 print("PAssed exec if true formatting step ") 

1529 

1530 append_resume = param_json["append_resume"] if "append_resume" in param_json else False 

1531 content_resume = input["content_resume"] if "content_resume" in input else "" 

1532 

1533 try : 

1534 print(" keys input : " + str(input.keys())) 

1535 length_input = {k : len(input[k]) if (type(input[k]) != bool and str(type(input[k])) != "<class 'NoneType'>") else 0 for k in input} 

1536 print(" length_input : " + str(length_input)) 

1537 except Exception as e: 

1538 print("ERROR Problem with input : " + str(e) + " treated as WARNING ") 

1539 

1540# "compte_rendu_complet_medecin": "from_json_copy" 

1541 compte_rendu_complet_medecin = param_json["compte_rendu_complet_medecin"] if "compte_rendu_complet_medecin" in param_json else "" 

1542 

1543 list_class_copy = param_json["list_class_copy"] if "list_class_copy" in param_json else [] 

1544 append_table_doc = (param_json["append_table_doc"] == 1 or param_json["append_table_doc"].lower() == "true") if "append_table_doc" in param_json else False 

1545 append_table_page = (param_json["append_table_page"] == 1 or param_json["append_table_page"].lower() == "true") if "append_table_page" in param_json else False 

1546 with_hyperlink = (param_json["with_hyperlink"] == 1 or str(param_json["with_hyperlink"]).lower() == "true") if "with_hyperlink" in param_json else False 

1547 append_parsing_meta_info_to_table = (param_json["append_parsing_meta_info_to_table"] == 1 or str(param_json["append_parsing_meta_info_to_table"]).lower() == "true") if "append_parsing_meta_info_to_table" in param_json else False 

1548 reproduce_format_new_page = (param_json["reproduce_format_new_page"] == 1 or str(param_json["reproduce_format_new_page"]).lower() == "true") if "reproduce_format_new_page" in param_json else False 

1549 reorder_paragraph_by_order_lex_token = (param_json["reorder_paragraph_by_order_lex_token"] == 1 or str(param_json["reorder_paragraph_by_order_lex_token"]).lower() == "true") if "reorder_paragraph_by_order_lex_token" in param_json else False 

1550 smart_new_line_from_token_pos = (param_json["smart_new_line_from_token_pos"] == 1 or str(param_json["smart_new_line_from_token_pos"]).lower() == "true") if "smart_new_line_from_token_pos" in param_json else False 

1551 order_by_date = (param_json["order_by_date"] == 1 or str(param_json["order_by_date"]).lower() == "true") if "order_by_date" in param_json else False 

1552 order_by_document_type = (param_json["order_by_document_type"] == 1 or str(param_json["order_by_document_type"]).lower() == "true") if "order_by_document_type" in param_json else False 

1553 result_input = input["result"] if "result" in input else "" 

1554 df = input["df"] if "df" in input else "None" 

1555 df_by_page = input["df_by_page"] if "df_by_page" in input else "None" 

1556 input_col_intro = input["input_col_intro"] if "input_col_intro" in input else "intro_correct_typo" 

1557 input_col_cr = input["input_col_cr"] if "input_col_cr" in input else "cr_correct_typo" 

1558 load_df_from_db_and_correct = (str(input["load_df_from_db_and_correct"]) == "1" or str(input["load_df_from_db_and_correct"]).lower() == "true") if "load_df_from_db_and_correct" in input else False 

1559 out_file = input["out_file"] if "out_file" in input else "" 

1560 nb_blank_line = param_json["nb_blank_line"] if "nb_blank_line" in param_json else 0 

1561 

1562 hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4()) 

1563 

1564 from lib.lib_util import parse_json_from_prompt_result, format_one_res, complete_date_and_order_json_to_mettre_en_forme, append_id_by_order 

1565 list_json_to_mettre_en_forme = parse_json_from_prompt_result(result_input) 

1566 list_json_to_mettre_en_forme = append_id_by_order(list_json_to_mettre_en_forme) 

1567 

1568 import pandas as pd 

1569 nb_doc = len(df) if type(df) == pd.DataFrame else 0 

1570 nb_page_from_df = len(df_by_page) if type(df_by_page) == pd.DataFrame else 0 

1571 

1572# if order_by_date : 

1573# list_json_to_mettre_en_forme = complete_date_and_order_json_to_mettre_en_forme(list_json_to_mettre_en_forme) 

1574 

1575 from lib.lib_util import create_prefix_file_name_from_json_prefix 

1576 prefix_file = create_prefix_file_name_from_json_prefix(input["prefix_file"]) if "prefix_file" in input else "" 

1577 

1578 df_complet_as_markdown = "" 

1579 df_complet_as_json = '' 

1580 nb_modif_manual = -1 

1581 nb_modif_class_manual = -1 

1582 nb_manual_action_df = -1 

1583 nb_manual_action_df_for_col_audit = -1 

1584 total_text = "" 

1585 result_output = "" # TODO duplicate 

1586 if verbose : 

1587 print(" load_df_from_db_and_correct : " + str(load_df_from_db_and_correct)) 

1588 else : 

1589 print(" load_df_from_db_and_correct : " + str(load_df_from_db_and_correct)[:100]) 

1590 out_folder = input["out_folder"] if "out_folder" in input else "temp" 

1591 

1592 from server.safia import lpgss_singleton 

1593 project_id = input["project_id"] if "project_id" in input else param_json["project_id"] if "project_id" in param_json else 70 

1594 conf_project = lpgss_singleton.load_conf_project(project_id) 

1595 format_json_from_conf = conf_project["saxia"]["format"]["info_format_intro"] if "saxia" in conf_project and "format" in conf_project["saxia"] and "info_format_intro" in conf_project["saxia"]["format"] else {} 

1596 info_format_intro_bis = format_json_from_conf["format"]["intro"] if "format" in format_json_from_conf and "intro" in format_json_from_conf["format"] else {"default": default_format_intro_hc} 

1597 

1598 print(" info_format_intro and info_format_intro_bis should be the same !") 

1599 

1600 outfile_name_docx = "" 

1601 audit_info_count = {} 

1602 audit_info_write = {} 

1603 # consolidate 

1604 if load_df_from_db_and_correct: 

1605 from lib.lib_safia_system import LibSafiaSystem 

1606 user_id = 0 

1607 from server.safia import lib_right_singleton 

1608 lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton) 

1609 lss.user_id = user_id # CA c'est un hack 

1610 hash_id_treatment = input["hash_id_treatment_input"] if "hash_id_treatment_input" in input else "default_value_hash_id_treatment" 

1611 

1612 from lib.manaudit.lib_datou_audit import load_audit_info_and_apply_manual_correction, list_action_by_user, count_time_lab_by_user 

1613 try : 

1614 

1615 df_auto, df_cons, hash_id_treatment_rerun, results, audit_info_count, id_file, audit_json_file_content_as_json, all_results, info_date = load_audit_info_and_apply_manual_correction(hash_id_treatment_auto = hash_id_treatment, 

1616 hash_id_treatment_manual = hash_id_treatment, 

1617 lpgss = lss.lib_user_data_internal, 

1618 project_id = project_id) 

1619 

1620 map_user_id_list_page, map_user_id_time_modif, map_user_id_list_pages_for_split = list_action_by_user(all_results, df_auto) 

1621 map_interval_sum_by_user = count_time_lab_by_user(all_results) 

1622 nb_correction_split = sum(list(map(lambda x: len(map_user_id_list_pages_for_split[x]), map_user_id_list_pages_for_split))) 

1623 if nb_correction_split == 0: 

1624 split_auto_perfect = True 

1625 else: 

1626 split_auto_perfect = False 

1627 if len(map_user_id_list_page) == 2: 

1628 print("We expect a labeliser and a corrector ") 

1629 user_id_0 = list(map_user_id_list_page.keys())[0] 

1630 user_id_1 = list(map_user_id_list_page.keys())[1] 

1631 nb_correction_0 = len(map_user_id_list_page[user_id_0]) 

1632 nb_correction_1 = len(map_user_id_list_page[user_id_1]) 

1633 if nb_correction_0 < nb_correction_1: 

1634 user_id_labeliser = user_id_1 

1635 user_id_corrector = user_id_0 

1636 elif nb_correction_0 == nb_correction_1: 

1637 print("WARNING EQUAL NUMBER OF CORRECTION BETWEEN LABELISER AND CORRECTOR, WE TAKE THE FIRST AS LABELISER") 

1638 user_id_labeliser = user_id_0 

1639 user_id_corrector = user_id_1 

1640 else : 

1641 user_id_labeliser = user_id_0 

1642 user_id_corrector = user_id_1 

1643 

1644 nb_page_no_correction = len(map_user_id_list_page[user_id_labeliser]) - len(map_user_id_list_page[user_id_corrector]) 

1645 max_nb_page = max(map_user_id_list_page[user_id_labeliser]) 

1646 time_minute_labelizer = map_interval_sum_by_user[user_id_labeliser]["total_minutes"] if user_id_labeliser in map_interval_sum_by_user and "total_minutes" in map_interval_sum_by_user[user_id_labeliser] else 0.0 

1647 time_minute_corrector = map_interval_sum_by_user[user_id_corrector]["total_minutes"] if user_id_corrector in map_interval_sum_by_user and "total_minutes" in map_interval_sum_by_user[user_id_corrector] else 0.0 

1648 nb_interval_labelizer = len(map_interval_sum_by_user[user_id_labeliser]["intervals"]) if user_id_labeliser in map_interval_sum_by_user and "intervals" in map_interval_sum_by_user[user_id_labeliser] else 0 

1649 nb_interval_corrector = len(map_interval_sum_by_user[user_id_corrector]["intervals"]) if user_id_corrector in map_interval_sum_by_user and "intervals" in map_interval_sum_by_user[user_id_corrector] else 0 

1650 info_correction = " nb_page_no_correction : " + str(nb_page_no_correction) + " pourcentage perfect : " + str(int(100 * float(nb_page_no_correction / float(max_nb_page)))) + \ 

1651 " user_id_labeliser : " + str(user_id_labeliser) + " with " + str(len(map_user_id_list_page[user_id_labeliser])) + f" corrections in {time_minute_labelizer:.2f} minutes in {nb_interval_labelizer} intervals " + \ 

1652 " user_id_corrector : " + \ 

1653 str(user_id_corrector) + " with " + str(len(map_user_id_list_page[user_id_corrector])) + f" corrections in {time_minute_corrector:.2f} minutes in {nb_interval_corrector} intervals " + \ 

1654 ", nb_page_no_correction : " + str(nb_page_no_correction) 

1655 nb_page_perfect = nb_page_no_correction 

1656 else : 

1657 time_minute_labelizer = -1 

1658 time_minute_corrector = -1 

1659 nb_interval_labelizer = -1 

1660 nb_interval_corrector = -1 

1661 info_correction = str(len(map_user_id_list_page)) + " users found in correction, we cannot separate labeliser and corrector : " + str(map_user_id_list_page) 

1662 

1663 if time_minute_labelizer == 0.0 or time_minute_corrector == 0.0: 

1664 info_correction += " WARNING time_minute_labelizer : " + str(time_minute_labelizer) + " or time_minute_corrector : " + str(time_minute_corrector) + " is zero " 

1665 info_correction += " map_interval_sum_by_user : " + str(map_interval_sum_by_user) 

1666 

1667 info_correction += " split_auto_perfect : " + str(split_auto_perfect) + " nb_correction_split : " + str(nb_correction_split) 

1668 

1669 #if type(df_auto_as_json) != types.NoneType: 

1670 # df_auto = pd.DataFrame(df_auto_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"]) 

1671 nb_modif_manual = sum(list(map(lambda x: len(x["manual_input_info"]["list_actions"]), all_results))) 

1672 try: 

1673 # nb_modif_class_manual = sum(list(map(lambda x: np.count_nonzero([0 if a["type_action"] != "class_paragraph" else 1 for a in x["manual_input_info"]["list_actions"]]), all_results))) 

1674 nb_modif_class_manual = audit_info_count["nb_modif_class_manual"] if "nb_modif_class_manual" in audit_info_count else -2 

1675 nb_manual_action_df = audit_info_count["nb_manual_action_df"] if "nb_manual_action_df" in audit_info_count else -2 

1676 nb_manual_action_df_for_col_audit = audit_info_count["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_count else -2 

1677 print(" Faisons mieux au dessus ") 

1678 except Exception as e: 

1679 print("Error while counting class_paragraph : " + str(e)) 

1680 

1681 df = results["df"] 

1682 

1683# id_file = all_result["id_file"] if "id_file" in all_result else "" 

1684 

1685 nb_doc = len(df) 

1686 try: 

1687 nb_page = len(audit_json_file_content_as_json["io_exec"]["3"]["input"]["paragraphs"]) 

1688 except Exception as e: 

1689 print(str(e)) 

1690 nb_page = -1 

1691 

1692 print("TODO warning 14-5-24 ce code doit etre dedupliquer vu que c'est le meme dans les deux parties de la condition consolidate et l'autre") 

1693 if order_by_date: 

1694 from lib.lib_util import order_df_by_date 

1695 df = order_df_by_date(df) 

1696 

1697 if order_by_document_type: 

1698 from lib.lib_util import order_by_document_type 

1699 df = order_by_document_type(df) 

1700 

1701 # Argument to modularize : df, input_col_intro, input_col_cr, out_file, hash_id_treatment 

1702 # Output : total_text 

1703 # why not output nb_file, nb_page, nb_modif_manual, 

1704 from lib.lib_util import write_table_list_inner_document_0424_bis 

1705 out_file = id_file + "_h_" + out_file 

1706 total_text, outfile_name_docx, audit_info_write = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, hash_id_treatment, out_folder, format_info, verbose=verbose, 

1707 content_resume=content_resume, append_resume=append_resume) 

1708 

1709 except Exception as e: 

1710 print("Error while loading and correcting df : " + str(e)) 

1711 

1712 else: # format from initial run (not consolidate) 

1713 df_auto = None 

1714 print(" df len 1058 : " + str(len(df))) 

1715 document = None 

1716 out_file = "" 

1717 

1718 print(" df len 1062 : " + str(len(df))) 

1719 

1720 if compte_rendu_complet_medecin == "from_json_copy": 

1721 from lib.lib_util import split_list_page_by_doc 

1722 text = input["text"] if "text" in input else None 

1723 if text != None and ( 

1724 len(input["list_page_per_doc"]) == len(text) or (len(input["list_page_per_doc"]) == 1 and type(text) == str)): 

1725 print( 

1726 "We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25") 

1727 if type(text) == str: 

1728# list_texts = [text] 

1729 list_list_page_doc = [[]] # en fait on aura besoin des paragraphs, refacto necessaire ! 

1730 list_list_page_doc = [[]] 

1731 try: 

1732 list_list_page_doc = split_list_page_by_doc(input["paragraphs"], input["list_page_per_doc"]) 

1733 except Exception as e: 

1734 print("l 1051 If only one doc it could run !, and maybe also with multiple doc by the way") 

1735 print(str(e)) 

1736 

1737 else : 

1738 list_list_page_doc = [None] * len(list_json_to_mettre_en_forme) 

1739 

1740 if len(list_list_page_doc) != len(list_json_to_mettre_en_forme): 

1741 print("ERROR TREATED AS WARNING BUT MAKES HUGE ERROR OR MISSING DATA Problem with list_list_page_doc and list_json_to_mettre_en_forme : " + str(len(list_list_page_doc)) + " != " + str(len(list_json_to_mettre_en_forme))) 

1742 

1743 if append_parsing_meta_info_to_table: 

1744 from lib.lib_util import add_parsing_meta_info_to_table 

1745 df = add_parsing_meta_info_to_table(df, list_json_to_mettre_en_forme) 

1746 else: 

1747 print(" ERROR if we do not add 'id' to df, we will not be able to merge with manual correction") 

1748 print(" Furthermore if I want to refacto the loop for exporting automatic datou results, I will have to add 'id' to df and also use df instead of list of json") 

1749 

1750 print(" df len 1077 : " + str(len(df))) 

1751 

1752 format_out_file = "docx" 

1753 out_file = prefix_file + "_" + hash_id_treatment + "." + format_out_file 

1754 

1755 print(" df len 1091 : " + str(len(df))) 

1756 

1757 # not used anymore, set as option or remove 

1758 if False: 

1759 from lib.lib_util import write_table_list_inner_document 

1760 import pandas as pd 

1761 if append_table_doc and type(df) == pd.DataFrame: 

1762 document = write_table_list_inner_document(df, document, with_hyperlink) 

1763 if append_table_page and type(df_by_page) == pd.DataFrame: 

1764 document = write_table_list_inner_document(df_by_page, document, with_hyperlink=False) 

1765 

1766 index_for_hyperlink = 0 

1767 for index, row in df.iterrows(): 

1768 one_json = row.to_dict() 

1769 list_paragraph_doc = [] 

1770 if index < len(list_list_page_doc): 

1771 list_page_doc = list_list_page_doc[index] 

1772 else : 

1773 print(" ERROR missing list page doc and we have not verified the rest !") 

1774 continue 

1775 # et non mais on pourrait faire des verifications TODO 26-4-24 one_json["Liste des pages"] if "Liste des pages" in one_json else "" 

1776# for one_json, list_page_doc in zip(list_json_to_mettre_en_forme, list_list_page_doc): 

1777 try : 

1778 if "document_type" not in one_json: 

1779 document_type = "default" 

1780 print(" Missing document_type in one result " + str(one_json)) 

1781 else: 

1782 if type(one_json["document_type"]) == list: 

1783 document_type = one_json["document_type"][0] 

1784 print(" document_type is a list : " + str(one_json["document_type"]) + " treated as " + str(document_type)) 

1785 else: 

1786 document_type = one_json["document_type"] 

1787 

1788 if compte_rendu_complet_medecin == "from_json_copy": 

1789 print(" How to be sure it is the same doc as in the list_page_doc ?? Et oui je crois que grace à l'id ou le fait qu'on a fusionné !") 

1790 from lib.lib_util import concat_content_from_list_page_doc 

1791 if reorder_paragraph_by_order_lex_token: 

1792 print("WILL FAIL WE NEED TO CHECK IF WE HAVE token in list_page_content ") 

1793 print(" list_page_content : " + str(list_page_content)) 

1794 from_json_content_copy = concat_content_from_list_page_doc(list_page_doc, 

1795 reproduce_format_new_page=reproduce_format_new_page, 

1796 height_line=0, 

1797 reorder_paragraph_by_order_lex_token = reorder_paragraph_by_order_lex_token, 

1798 smart_new_line_from_token_pos = smart_new_line_from_token_pos, 

1799 list_class_copy = list_class_copy) 

1800 one_json["compte_rendu_complet_medecin"] = from_json_content_copy 

1801 print(" from_json_content_copy : " + str(from_json_content_copy)[:100]) 

1802 df.loc[index_for_hyperlink, "cr_back"] = "empty" 

1803 

1804# df.loc[index_for_hyperlink, "cr_back"] = from_json_content_copy.replace("\n\n", "\n") 

1805 df.loc[index_for_hyperlink, ['cr_back']] = [from_json_content_copy.replace("\n\n", "\n")] 

1806 else: 

1807 if type(one_json["compte_rendu_complet_medecin"]) == list: 

1808 print(" ERROR OR WARNING How to handle compte_rendu_complet_medecin as list : " + str(one_json["compte_rendu_complet_medecin"])) 

1809 df.loc[index_for_hyperlink, "cr_back"] = str(one_json["compte_rendu_complet_medecin"]) 

1810 

1811 # TODO remove VR 25-4-24 : certainement useless 

1812 if out_file == "": 

1813 out_file = prefix_file + "_" + hash_id_treatment + "." + format_out_file 

1814 

1815 new_format_info = info_format_intro[document_type] if document_type in info_format_intro else info_format_intro["default"] 

1816 print(" new_format_info : " + str(new_format_info) + " document_type : " + str(document_type) + " info_format_intro.keys : " + str(info_format_intro.keys())) 

1817 

1818 format_premier = format_info["format_premier"] if "format_premier" in format_info else "default" 

1819 format_date = format_info["format_date"] if "format_date" in format_info else "default" 

1820 list_variable_underline = format_info["list_variable_underline"] if "list_variable_underline" in format_info else [] 

1821 list_variable_bold = format_info["list_variable_bold"] if "list_variable_bold" in format_info else [] 

1822 

1823 new_intro = format_one_res(one_json, new_format_info, format_premier, format_date, verbose=verbose, 

1824 list_variable_bold=list_variable_bold, list_variable_underline=list_variable_underline) 

1825 df.loc[index_for_hyperlink, "intro_back"] = new_intro 

1826 # TODO sans doute à garder pratique pour debugguer mais en fait calculer par write ... 

1827 result_output += new_intro 

1828 index_for_hyperlink += 1 # TODO renommer 

1829 

1830 except Exception as e: 

1831 print("Error while parsing one result : " + str(e)) 

1832 

1833 print(" df len 1147 : " + str(len(df))) 

1834 

1835# input_col_intro = "intro_back" 

1836# input_col_cr = "cr_back" 

1837# from lib.lib_util import write_table_list_inner_document_0424_bis 

1838# total_text = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, 

1839# hash_id_treatment, out_folder) 

1840 # A refacto 

1841 # with_hyperlink 

1842 # reproduce_format_new_page 

1843 

1844 

1845 try: 

1846 if order_by_date: 

1847 from lib.lib_util import order_df_by_date 

1848 df = order_df_by_date(df) 

1849 except Exception as e: 

1850 print("CHECK !") 

1851 print(str(e)) 

1852 

1853 if order_by_document_type: 

1854 from lib.lib_util import order_by_document_type 

1855 df = order_by_document_type(df) 

1856 

1857 if nb_blank_line > 0: 

1858 from lib.lib_util import add_blank_line 

1859 try: 

1860 df = add_blank_line(df, nb_blank_line) 

1861 except Exception as e: 

1862 print(str(e)) 

1863 

1864 

1865 import pandas as pd 

1866 df_complet_as_markdown = df.to_markdown() if type(df) == pd.DataFrame else "" 

1867 df_complet_as_json = df.to_json() if type(df) == pd.DataFrame else "" 

1868 df_auto = None 

1869 

1870# print(" df_complet_as_markdown len 1152 : " + str(len(df_complet_as_markdown))) 

1871 print(" out_folder : " + out_folder) 

1872 

1873 input_col_intro = "intro_back" 

1874 input_col_cr = "cr_back" 

1875 from lib.lib_util import write_table_list_inner_document_0424_bis 

1876 try: 

1877 total_text, outfile_name_docx, audit_info_write = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, 

1878 hash_id_treatment, out_folder, format_info) 

1879 except Exception as e: 

1880 print(str(e)) 

1881 print("Error while computing plop write_table_list_inner_document_0424_bis format") 

1882 

1883 output = {"result" : result_output, "out_file" : out_file, 

1884 "df_complet_as_markdown" : df_complet_as_markdown, 

1885 "df_complet_as_json" : df_complet_as_json, 

1886 "nb_doc" : nb_doc, 

1887 "nb_page_from_df" : nb_page_from_df, 

1888 "nb_word_result" : total_text.count(" ") + 1, 

1889 "nb_modif_manual" : nb_modif_manual, 

1890 "nb_doc_modif_correct_test_2812" : "test_integration_prime_productivite", 

1891 "prime_productivite_test_2812" : "test_integration_prime_productivite", 

1892 "info_correction" : info_correction if 'info_correction' in locals() else "", 

1893 "nb_page_perfect" : nb_page_perfect if 'nb_page_perfect' in locals() else -1, 

1894 "time_minute_labelizer" : time_minute_labelizer if 'time_minute_labelizer' in locals() else -1.0, 

1895 "user_id_labeliser" : user_id_labeliser if 'user_id_labeliser' in locals() else -1, 

1896 "split_auto_perfect" : split_auto_perfect if 'split_auto_perfect' in locals() else None, 

1897 "nb_modif_class_manual" : nb_modif_class_manual, 

1898 "prefix_file" : prefix_file, 

1899 "nb_manual_action_df_for_col_audit" : nb_manual_action_df_for_col_audit, 

1900 "nb_manual_action_df" : nb_manual_action_df, 

1901 "out_folder" : out_folder, 

1902 "df" : df, 

1903 "outfile_name_docx" : outfile_name_docx, 

1904 "content_commemo" : total_text} 

1905 

1906 print(str(audit_info_count.keys())) 

1907 

1908 # Temporaire ! 

1909 path_csv_complete_with_prediag = "/Users/moilerat/Documents/Fotonower/Safia/prompt/misc/csv_prediag_all_almost/all_csv_prediag.csv" 

1910# path_csv_complete_with_prediag = "/home/safia/workarea/git/Safia/prompt/python/misc/csv_prediag_all_almost/all_csv_prediag.csv" 

1911 if os.path.exists(path_csv_complete_with_prediag): 

1912 try: 

1913 import pandas as pd 

1914 df_prediag, _ = pd.read_csv(path_csv_complete_with_prediag, sep=";") 

1915 

1916 from lib.sandbox.migration import migrate_df_complete_with_prediag 

1917 df = migrate_df_complete_with_prediag(df_prediag, df, id_file) 

1918 

1919 except Exception as e: 

1920 print("Error while loading csv complete with prediag : " + str(e)) 

1921 

1922 #if limit == 0: 

1923 print(" if limit == 0 (mais d'ou le trouve t'on grrr) On pourrait aussi insérer le prediag dans le df et le renvoyer dans le json") 

1924 

1925 output["map_count_modif_per_doc"] = audit_info_count["map_count_modif_per_doc"] if "map_count_modif_per_doc" in audit_info_count else {} 

1926 output["map_modif_type_document"] = audit_info_count["map_modif_type_document"] if "map_modif_type_document" in audit_info_count else {} 

1927 output["audit_info_write"] = audit_info_write 

1928 try : 

1929 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 'Nombre de pages', 'Titre', 'medecin_specialite', 'nom_hopital', 'genre_service_hopital', 

1930 'indication_examen', 'date_entree_hospitalisation', 

1931 'date_sortie_hospitalisation', 'motif_hospitalisation', 

1932 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced', 

1933 'date_fin_arret_travailt', 'date_entree_hospitalisationt', 'date_sortie_hospitalisationt']: 

1934 col_auto = col_ref + "_auto" 

1935 df[col_auto] = "None" 

1936 

1937 if col_ref not in df.columns: 

1938 print("Missing col ref : " + str(col_ref) + " in df") 

1939 continue 

1940 

1941 if type(df_auto) == pd.DataFrame: 

1942 df_auto['Nombre de pages'] = "None" 

1943 for index, row in df_auto.iterrows(): 

1944 if 'Liste des pages' in df_auto.columns: 

1945 df_auto.loc[index, 'Nombre de pages'] = str(len(row['Liste des pages'].split(','))) if type(row['Liste des pages']) == str else "None" 

1946 

1947 if index < len(df): 

1948 found_list = df.index[df['Liste des pages'] == row['Liste des pages']].tolist() 

1949 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 

1950 'Nombre de pages', 'Titre', 'medecin_specialite', 'nom_hopital', 

1951 'genre_service_hopital', 

1952 'indication_examen', 'date_entree_hospitalisation', 

1953 'date_sortie_hospitalisation', 'motif_hospitalisation', 

1954 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced', 

1955 'date_fin_arret_travailt', 'date_entree_hospitalisationt', 

1956 'date_sortie_hospitalisationt']: 

1957 col_auto = col_ref + "_auto" 

1958# df[col_auto] = "None" 

1959 

1960 if col_ref not in df.columns: 

1961 print("Missing col ref : " + str(col_ref) + " in df") 

1962 continue 

1963 

1964 if len(found_list) == 0: 

1965 print("Missing idx for liste pages " + str(row['Liste des pages'])) 

1966 if len(found_list) > 1: 

1967 print("Warning : more than one idx for liste pages " + str(row['Liste des pages']) + " found : " + str(found_list)) 

1968 

1969 for idxf in found_list: 

1970 df.loc[idxf, col_auto] = row[col_ref] 

1971# df.loc[index, col_auto] = row[col_ref] 

1972 else : 

1973 print("Inconsistent dataframe auto and df : " + str(index) + " > " + str(len(df))) 

1974 

1975 print(" df_auto : " + str(df_auto)) 

1976 output["df_cons"] = df.to_json() 

1977 # Test write and load df.to_dict() 

1978 output["df_cons"] = df.to_dict() 

1979 output["df_auto"] = df_auto.to_json() 

1980 except Exception as e: 

1981 print(str(e)) 

1982 if "hash_id_treatment_input" in input: 

1983 output["hash_id_treatment_input"] = input["hash_id_treatment_input"] 

1984 

1985 return output 

1986 

def datou_safia_step_classify_doc(input : dict = None,
                                  param_json : dict = None,
                                  ce : "CE" = None,
                                  verbose : bool = False,
                                  layer_api : "LayerGeneric" = None) -> dict :
    """Dispatch one sub-task of the document-classification pipeline.

    The sub-task is selected by ``task`` (read from ``input`` first, then
    ``param_json``).  Known tasks: ``re_classifier``, ``prepare_prompt``,
    ``parse_result``, ``merge_result``, ``classify_doc``,
    ``detect_name_camembert``, ``parse_result_camembert``,
    ``prepare_anon_from_camembert``, ``create_entity_bib_from_camembert``.

    Parameters
    ----------
    input : dict
        Runtime payload; depending on the task it may contain "paragraphs",
        "task", "result", "text", "pers", "list_page_content".
    param_json : dict
        Static configuration; may contain "rules_classifier", "task",
        "input_format", "taxonomy_text" and the camembert model parameters.
    ce : CE
        Cost-estimation helper (unused here, kept for the step interface).
    verbose : bool
        Forwarded to the underlying classifiers.
    layer_api : LayerGeneric
        LLM layer (unused here, kept for the step interface).

    Returns
    -------
    dict
        Task-dependent result; ``{}`` for tasks that only mutate their input
        in place (``parse_result`` writes back into ``input["paragraphs"]``)
        or that are not implemented yet.
    """
    list_input = []
    list_output = []
    list_param_json = []

    # BUG FIX: mutable default arguments ({}) are shared across calls;
    # normalize None to a fresh dict instead.
    input = {} if input is None else input
    param_json = {} if param_json is None else param_json

    paragraphs = input.get("paragraphs", [])
    rules_classifier = param_json.get("rules_classifier", [])
    task = input["task"] if "task" in input else param_json.get("task", "")

    list_detects = rules_classifier["detect"] if "detect" in rules_classifier else "detect"
    map_classifier = rules_classifier["classify"] if "classify" in rules_classifier else "classify"
    input_format = param_json.get("input_format", "markdown")

    # task : re_classifier,prepare_prompt,parse_result,merge_result,classify_doc

    # Default taxonomy used to build the classification prompt.
    # BUG FIX: the former default was wrapped in an extra {"taxonomy_text": ...}
    # level, so building map_taxonomy_per_key below always raised and the map
    # silently ended up empty.
    taxonomy_text = param_json.get("taxonomy_text", {
        "header":
            {"key": "H",
             "description": "En-tete et pied de page : adresse(s) de la structure, nom du/des medecins, numéro du secretariat, raison social, Logo, numéro des pages, les titres et diplomes, condition de paiement"
             },
        "info_medecin":
            {"key": "M",
             "description": "- Spécialité et Nom du médecin (des fois en en-tete ou signature (eventuellement tampon) )\n- Date du jour (ou de l'edition)\n- Données personnels du patient : ...\n- Contenu pertinent : \n - A faire selon les classes de document\n - Par défaut\n- Ecriture manuscrite\n- Document administratif (CNI)\n- Autres : Règle administrative, preuve du respect du secret médical\nCertificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit."},
        "content":
            {"key": "C", "description": "Contenu pertinent : description specifique du cas du patient"},
        "manuscrit":
            {"key": "A", "description": "Mots tronqués ou mal orthographier, alphabet étrangère"},
        "admin": {"key": "D", "description": "Document administratif (CNI) ou autre document administratif"},
        "autre": {"key": "O", "description": "Autres : Règle administrative, preuve du respect du secret médical, par exemple le texte : Certificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit."}
    })

    # Reverse lookup: short taxonomy key ("H", "M", ...) -> taxonomy name.
    try :
        map_taxonomy_per_key = {taxonomy_text[k]["key"] : k for k in taxonomy_text}
    except Exception as e:
        print(str(e))
        map_taxonomy_per_key = {}

    # BUG FIX: several branches (re_classifier, merge_result, classify_doc,
    # the *_camembert stubs) never assigned `output`, which made the final
    # `return output` raise NameError.  Default to an empty result.
    output = {}

    if task == "re_classifier":
        from lib.lib_ml.lib_text_classifier import classify_text
        # BUG FIX: `list_page_content` was read without ever being defined in
        # this branch; it is now taken from the input payload.
        list_page_content = input.get("list_page_content", [])
        for page in list_page_content:
            for paragraph in page.list_blocks["paragraphs"]:
                for detect in list_detects:
                    res = classify_text(paragraph["text"], list_detects[detect], verbose = verbose)
                    if len(res) > 0:
                        print("detect : " + str(detect) + " res : " + str(res))
                        if detect in map_classifier:
                            paragraph["class"] = map_classifier[detect]
                        else :
                            print("Missing class, by default we keep it as content_classn but later since this case is not used in production as of 4-3-24")
    elif task == "prepare_prompt": # prepare_prompt_classifier_bib
        from lib.prompt.lib_gen_prompt import generate_prompt_classifier
        preprompt = generate_prompt_classifier(taxonomy_text, verbose = verbose, output_type = input_format)
        # TODO add in input input_type = "json", output_type = "table",
        #      language = "fr",
        #      output_key = "id", output_class = "classe",
        #      separator
        text = str(paragraphs[0]) if len(paragraphs) > 0 else ""
        output = {"preprompt" : preprompt, "text" : text}
    elif task == "parse_result": # parse_result_prompt_classifier_bib
        result = input.get("result", "")
        from lib.batch.lib_batch import create_pandas_table_from_text
        from lib.lib_util import parse_json_from_prompt_result

        if input_format == "markdown":
            df = create_pandas_table_from_text(result, verbose = verbose)
        elif input_format == "json":
            df = parse_json_from_prompt_result(result, verbose = verbose)
        else :
            print("ERROR Unsupported input_format : " + str(input_format))
            # BUG FIX: `df` was left undefined here, which crashed the lookup
            # loop below with NameError.
            df = None
        print("TODO finish voila (add in document if only one document)")
        if len(paragraphs) != 1:
            print("only one document is managed since we use table as output for prompt !")
        elif df is not None:
            output_class = "classe"
            import sys  # hoisted out of the per-paragraph loop
            for paragraph in paragraphs[0]:
                sys.stdout.write("ç")
                if "id" not in paragraph:
                    print("Missing id in paragraph : " + str(paragraph))
                    continue
                para_id = paragraph["id"]  # renamed: `id` shadowed the builtin
                data_found = df[df["id"] == str(para_id)]
                key_classes = data_found[output_class] if output_class in data_found else []
                if len(key_classes) == 1:
                    # BUG FIX: `key_classes[id]` label-indexed the filtered
                    # Series by paragraph id, which only matches by accident;
                    # take the single matching row positionally instead.
                    key_class = key_classes.iloc[0]
                else:
                    print("Missing class")
                    key_class = "unknown"
                paragraph["class"] = map_taxonomy_per_key.get(key_class, "unknown")
        # Results are written back into input["paragraphs"]; nothing to return.

    elif task == "merge_result":
        pass
    elif task == "classify_doc":
        pass
    elif task == "detect_name_camembert": # context_entity_camembert
        from lib.lib_ml.lib_nlp.lib_pipeline_ner import detect_name_ner
        input_text = input.get("text", "")
        name_pretrained_model = param_json.get("name_pretrained_model", "Jean-Baptiste/camembert-ner")
        name_tokenizer = param_json.get("name_tokenizer", "Jean-Baptiste/camembert-ner")
        what_I_want = param_json.get("what_I_want", "PER")
        aggregation_strategy = param_json.get("aggregation_strategy", "simple")
        # NOTE(review): a disabled (`if False`) branch used to iterate
        # input["list_page_content"] here instead of the raw text; removed as
        # dead code.
        list_to_treat = [input_text]

        all_list_name = []
        for input_text_aux in list_to_treat:
            list_name = detect_name_ner(input_text_aux,
                                        name_pretrained_model=name_pretrained_model,
                                        name_tokenizer=name_tokenizer,
                                        what_I_want=what_I_want,
                                        aggregation_strategy=aggregation_strategy)
            all_list_name.extend(list_name)

        print(" list_name : " + str(all_list_name))
        # "result" mirrors the fenced-JSON format produced by the LLM prompt
        # steps so downstream parsing can stay uniform.  (A dead first
        # `output = {"pers": ...}` store was removed.)
        output = {"pers" : all_list_name,
                  "result" : "```json\n{\"PERS\":[" + ",".join(list(map(lambda x : "\"" + x + "\"", all_list_name))) + "]}\n```"}
    elif task == "parse_result_camembert": # parse_result_camembert
        list_name = input.get("pers", [])
        pass  # TODO: not implemented yet
    elif task == "prepare_anon_from_camembert": # parse_result_camembert
        list_name = input.get("pers", [])
        pass  # TODO: not implemented yet
    elif task == "create_entity_bib_from_camembert": # parse_result_camembert
        pass
    else :
        print("Unsupported task : " + str(task))
        output = {"result": "Unsupported task : " + str(task)}

    import sys
    sys.stdout.write("O")
    return output

2134 

2135 

def datou_safia_step_client(input : dict = None,
                            param_json : dict = None,
                            ce : "CE" = None,
                            verbose : bool = False,
                            layer_api : "LayerGeneric" = None) -> dict : # pragma no cover
    """Upload file(s) and a preprompt to a remote HTTP endpoint and collect results.

    For each file a multipart POST is sent to
    ``{protocol}://{host}:{port}/{end_point}``.  The response appears to be
    JSON of the shape ``{"res": [<llama-style transcript>, ...]}``; only the
    text after the first ``assistant<|end_header_id|>`` marker of the first
    transcript is kept as the per-file result — TODO confirm against the
    server implementation.

    Parameters
    ----------
    input : dict
        May contain "file", "files", "preprompt", "model".
    param_json : dict
        May contain "host", "protocol", "port", "end_point", "preprompt".
    ce : CE
        Cost-estimation helper (unused here, kept for the step interface).
    verbose : bool
        Unused here, kept for the step interface.
    layer_api : LayerGeneric
        Unused here, kept for the step interface.

    Returns
    -------
    dict
        {"map_result": {basename: extracted text},
         "result": comma-joined extracted texts,
         "full_result": {basename: raw parsed JSON response}}
    """
    list_input = ["file", "preprompt", "model", "files"]
    list_output = []
    list_param_json = ["host", "protocol", "port", "end_point", "preprompt"]

    import json
    import logging
    import os

    import requests  # third-party; function-local import as elsewhere in this module

    # BUG FIX: mutable default arguments ({}) are shared across calls.
    input = {} if input is None else input
    param_json = {} if param_json is None else param_json

    end_point = param_json.get("end_point", "api/v1/upload")
    host = param_json.get("host", "localhost")
    protocol = param_json.get("protocol", "http")
    port = param_json.get("port", 4998)

    file = input.get("file", "")
    files = input.get("files", [])
    preprompt = input["preprompt"] if "preprompt" in input else param_json.get("preprompt", "")
    model = input.get("model", "")

    logger = logging.getLogger()
    logger.info("In datou_safia_step_client l 2081 ") # + str(__line__))

    print("file : " + str(file))
    print("files : " + str(files))
    print("preprompt : " + str(preprompt))
    print("model : " + str(model))

    url = f"{protocol}://{host}:{port}/{end_point.lstrip('/')}"
    # BUG FIX: the old code appended to `files` in place, mutating the
    # caller's list when an empty "files" entry was provided; build a local
    # fallback list instead.
    if len(files) == 0:
        files = [file]

    map_res_file = {}
    map_full_res_file = {}
    for f in files:
        fbn = os.path.basename(f)
        logger.info("In datou_safia_step_client l 2102 preprompt " + str(preprompt[:100])) # + str(__line__))

        data = {'preprompt': preprompt} #, 'model': model}
        if model is not None and model != '':
            data['model_name'] = model
        try :
            # BUG FIX: the file handle was opened and never closed (resource
            # leak; open() was also outside the try, so a missing file crashed
            # the whole step).  A context manager fixes both.
            with open(f, 'rb') as fh:
                response = requests.post(url, files={'file': fh}, data=data)
            if response.status_code == 200:
                print("File uploaded successfully")
            else:
                print("File upload failed")

            logger.info("In datou_safia_step_client l 2112 response received ") # + str(__line__))
            logger.info("In datou_safia_step_client l 2112 response.status_code " + str(response.status_code)) # + str(__line__))

            print("TO USE TO CREATE NEW STEP")

            res_parsed_json = json.loads(response.content.decode('utf-8'))
        except Exception as e:
            print("Error while sending file to server in datou_Step_client : " + str(e))
            res_parsed_json = {}

        # Keep only the assistant part of the first transcript, e.g.
        #   assistant<|end_header_id|> ... TABLEAU<|eot_id|>
        some_useful_result = res_parsed_json.get("res", [])
        one_useful_result = some_useful_result[0] if len(some_useful_result) > 0 else ""
        result_as_array = one_useful_result.split("assistant<|end_header_id|>")
        if len(result_as_array) > 1:
            the_useful_result = result_as_array[1].replace("\n", "").rstrip("<|eot_id|>")
        else :
            the_useful_result = ""

        map_res_file[fbn] = the_useful_result
        map_full_res_file[fbn] = res_parsed_json

    logger.info("In datou_safia_step_client l 2138 wip ")
    # Idiom: str.join instead of manual += concatenation.
    csv_result = ",".join(map_res_file[f] for f in map_res_file)

    output = {"map_result" : map_res_file,
              "result" : csv_result,
              "full_result" : map_full_res_file}

    return output

2227 

2228 

2229 

# Kept for easy implementation of a new function (remove "pragma no cover" and complete all fields).
# Sometimes it will also be needed to do DEV DOC : INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('append_to_doc', 'datou_safia_step_append_to_doc_content', '["openai_token"]', '["result", "document_id", "project_id", "user_id"]', '["result", "references"]');

def datou_safia_step_TEMPLATE(input : dict = {},
                              param_json : dict = {},
                              ce : CE = None,
                              verbose : bool = False,
                              layer_api : LayerGeneric = None) -> dict : # pragma no cover
    """Skeleton for implementing a new datou step.

    Copy this function, rename it, fill in the declared input / output /
    param lists and the body, then drop the ``pragma no cover`` marker.
    Returns a recognisable placeholder payload.
    """
    # Declared step contract — intentionally empty in the template.
    list_input = []
    list_output = []
    list_param_json = []

    print("TO USE TO CREATE NEW STEP")

    return {"result" : "some_result (but TEMPLATE)"}

2246 

2247