Coverage for lib/datou/lib_datou_step_template.py: 62%

1310 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2026-02-10 01:10 +0100

1import datetime 

2import os.path 

3import shutil 

4import types 

5 

6import numpy as np 

7 

8from lib.brick_layers.lib_abstract_generic_layer import LayerGeneric, LayerPrompt 

9 

10# TODO ARCHI VR 14-6-23 : est-ce qu'on ferait une classe pour avoir les services de cost_estimation 

11from auth.lib_cost import CostEstimation as CE 

12from uuid import uuid4 

13 

14# speech_to_text 

def datou_safia_step_speech_to_text(input : dict, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    """Transcribe an audio/video file into text.

    Parameters
    ----------
    input : dict
        Must contain "file" (path to the media file). An optional
        "preprompt" key is forwarded untouched to the output.
    param_json : dict, optional
        Must contain "openai_token"; may contain "parse_prefix_file".
    ce : CE
        Cost-estimation accumulator; compute_cost_search() is called with
        the model used and the audio duration in seconds.
    verbose : bool
        Forwarded to the speech-to-text helpers.
    layer_api : LayerGeneric
        Unused here; kept for step-signature uniformity.

    Returns
    -------
    dict
        {"text": transcript} plus "preprompt" when present in the input.
    """
    # BUGFIX: the original used a mutable default argument ({}), which is
    # shared across calls; use a None sentinel instead.
    if param_json is None:
        param_json = {}

    list_input = ["file"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["text"]
    list_param_json = ["openai_token"]  # TODO VR REFACTO : have an editing interface or a context from which to fetch this info (that context may depend on the project / model type / a datou "instantiation")

    file = input["file"]
    openai_token = param_json["openai_token"]

    parse_prefix_file = bool(param_json["parse_prefix_file"]) if "parse_prefix_file" in param_json else False

    # Hard-coded param: let the backend auto-detect the language.
    language = None

    from lib.lib_speechtotext import speech_to_text
    text = ""
    length_time_seconds = 0
    model = ""  # BUGFIX: pre-initialize so ce.compute_cost_search never sees an unbound name
    if file.endswith((".amr", ".ogg", ".mp4", ".webm")):
        # Containers the transcription API cannot ingest directly are converted first.
        print("import convert_file")
        from lib.lib_speechtotext import convert_file
        print("calling convert_file")
        new_file = convert_file(file)
        print("calling speech_to_text")
        text, length_time_seconds, model = speech_to_text(new_file, openai_token, language=language, verbose=verbose)

    # TODO VR REFACTO : these two duplicated calls are not great
    elif file.endswith((".mp3", ".m4a", ".wav")):
        size = os.path.getsize(file)
        print(" size : " + str(size))

        if size > 10000000:  # pragma no cover scale
            # Files above ~10 MB are converted and split into chunks small
            # enough for the transcription API, then transcribed piecewise.
            print(" size : " + str(size))
            from lib.lib_speechtotext import convert_file
            print("calling convert_file")
            new_file = convert_file(file)
            size = os.path.getsize(new_file)
            print(" size : " + str(size))

            from lib.lib_speechtotext import split_mp3
            nb_split = 1 + int(size / 10000000)
            list_files = split_mp3(new_file, nb_split, verbose=verbose)

            text = ""
            length_time_seconds = 0
            model = ""
            for file_aux in list_files:
                text_aux, length_time_seconds_aux, model = speech_to_text(file_aux, openai_token, language=language, verbose=verbose)
                text += text_aux
                length_time_seconds += length_time_seconds_aux
        else :
            text, length_time_seconds, model = speech_to_text(file, openai_token, language=language, verbose=verbose)
    else :
        # ROBUSTNESS: unknown extensions previously fell through with no
        # transcription attempt; try the direct call as a best effort.
        text, length_time_seconds, model = speech_to_text(file, openai_token, language=language, verbose=verbose)

    ce.compute_cost_search(model, length_time_seconds)

    # TODO VR REFACTO : the temporary converted/split files should also be deleted

    output = {"text" : text}

    if "preprompt" in input:
        output["preprompt"] = input["preprompt"]

    return output

78 

79 

80 

def sub_func_read_ocr(f, count, model, verbose, map_file_size, map_file_text, folder_export_boxes,
                      begin_page=False, end_page=False, file_output="output",
                      layer_api = None, vllm_model = None,
                      request_used = None):
    """Run one OCR / vision pass on a single image file.

    Dispatches on *model*: "tesseract", "prompt" (VLLM via layer_api),
    "gcp_doc_ai" (with a Google-Vision fallback on failure), anything else
    falls back to Google Vision OCR. Optionally exports the detected boxes
    as JSON into *folder_export_boxes*, records text length and content in
    the two map_* accumulators (mutated in place), and returns
    (count, text, SubDocPage).
    """
    from lib.lib_util import SubDocPage
    from lib.lib_ocr import img_to_texte, ocr_google_vision, gcp_doc_ai
    # VR TODO 9-5-25: this loop exists twice and is therefore only tested once, but we do not know which one
    if model == "tesseract":
        text, list_boxes, maxx, maxy, list_blocks = img_to_texte(f, verbose)
    elif model == "prompt":
        print("Prompt ! ")
        # The prompt path produces only text — no geometry.
        list_boxes = []
        maxx = 0
        maxy = 0
        list_blocks = {}

        if verbose:
            print("before call request_gpt")
        try :
            if layer_api == None:
                # BUGFIX: the original assigned (result, nb_token, modele)
                # here, leaving `text` unbound and crashing later at
                # len(text); assign `text` instead.
                text, nb_token, modele = "", 0, ""
            else :
                text, nb_token, modele = layer_api.prompt(request_used = request_used, gpt_model = vllm_model,
                                                          verbose = verbose,
                                                          images = [f])
        except Exception as e:
            print(str(e))
            text, nb_token, modele = "", 0, "ERROR IN PROMPT"

    elif model == "gcp_doc_ai":
        # Document AI rejects very large files; we still try, and rely on
        # the except fallback below when it fails.
        if os.stat(f).st_size > 20000000:
            print(" Expecting failure due to too big file : " + str(f) + " " + str(os.stat(f).st_size))
        else:
            print(" os.stat(f).st_size : " + str(os.stat(f).st_size))
        try:
            text, list_boxes, maxx, maxy, list_blocks = gcp_doc_ai(f, verbose=verbose)
        except Exception as e:
            print("ERROR TREATED AS WARNING THANKS RECUPERATION : OCR gcp_doc_ao FAILED on " + str(
                f) + " We wil try the old one ! too bad if it is a cerfa ")
            print(str(e))
            text = ""
            list_boxes = []
            maxx = 0
            maxy = 0
            list_blocks = []
            text, list_boxes, maxx, maxy, list_blocks = ocr_google_vision(f, verbose)
    else: # google_ocr
        text, list_boxes, maxx, maxy, list_blocks = ocr_google_vision(f, verbose)
    if folder_export_boxes != "":
        # Dump the raw OCR boxes next to the page index for later inspection.
        if not os.path.exists(folder_export_boxes):
            os.makedirs(folder_export_boxes)
        with open(folder_export_boxes + "/" + file_output + "_" + str(count) + ".json", "w") as of:
            import json
            of.write(json.dumps(list_boxes))
    map_file_size[f] = len(text)
    map_file_text[f] = text
    sdp = SubDocPage(count, text, f, list_boxes, maxx, maxy, list_blocks)

    return count, text, sdp

143 

144# image_to_text 

def datou_safia_step_image_to_text(input : dict, param_json : dict = {}, ce : CE = None,
                                   verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    """OCR step: extract text from an input image or PDF file.

    PDFs are first rendered to one PNG per page; every page is then read
    by sub_func_read_ocr with the engine selected by param_json["model"]
    ("google_ocr" by default), and the per-page texts are re-assembled
    into one transcript per logical document group.

    NOTE(review): input/param_json carry many optional keys whose schema
    is defined by the callers (datou pipeline) — confirm against callers.
    Returns a dict with "text", "preprompt", page contents/images and
    bookkeeping metadata (see the output dict near the end).
    """
    list_input = ["file"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["text", "preprompt"]  # TODO VR REFACTO : this preprompt should be built elsewhere
    list_param_json = ["google_token", "model"]  # TODO VR REFACTO : have an editing interface or a context from which to fetch this info (that context may depend on the project / model type / a datou "instantiation")

    model = param_json["model"] if "model" in param_json else "google_ocr"
    file = input["file"]
    google_token = param_json["google_token"] if 'google_token' in param_json else None
    dpi = param_json["dpi"] if "dpi" in param_json else 72  # rendering resolution for PDF -> PNG
    if google_token == None:
        print(" Will crash or not !")

    parse_prefix_file = bool(param_json["parse_prefix_file"]) if "parse_prefix_file" in param_json else True
    print(" parse_prefix_file : " + str(parse_prefix_file))
    parse_date_test_before_own_datou_step = bool(param_json["parse_date_test_before_own_datou_step"]) if "parse_date_test_before_own_datou_step" in param_json else False
    hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else None
    only_count = bool(param_json["only_count"]) if "only_count" in param_json else False
    only_extract_page = bool(param_json["only_extract_page"]) if "only_extract_page" in param_json else False

    # When multiple files arrive in input as a raw split.
    multi_input = input["multi_input"] if "multi_input" in input else False
    saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""
    saxia_all_doc_separated = input["saxia_all_doc_separated"] if "saxia_all_doc_separated" in input else False
    use_split_complet = param_json["use_split_complet"] if "use_split_complet" in param_json else False
    if saxia_all_doc_separated and (saxia_split_end_csv != "") and use_split_complet:
        print("Here we want to not split with prompt, maybe we should use multi_input and if there is only file w ecould also activate this case ! !")
    create_output_hit = bool(param_json["create_output_hit"]) if "create_output_hit" in param_json else False

    # Normalise `file` to a list and keep a single reference file for metadata.
    if type(file) == list:
        if len(file) == 0:
            print("ERROR treated as WARNING : No Input file !, we can quit or not, it shouldn't matter ")

        # temporary
#        file = file[0]

        print(" ERROR treated as WARNING : only the first file will be treated : " + str(file))
        print("TODO multiple files not implemented yet !")
        if multi_input:
            print("VR TODO 24/6/24 wip multi_input => is in fact working !")
        else :
            print(" We have not decided yet the default behavior VR TODO CDC 24/6/24 : for now this is an internal error to be in this situation")
        one_file_reference = file[0]
    else:
        one_file_reference = file
        file = [file]

    if one_file_reference == None:
        print(" ERROR treated as WARNING : No Input file reference !, we can quit or not, it shouldn't matter ")
    # File metadata used at the end to enrich the output dict.
    size_file = os.stat(one_file_reference).st_size
    created_at = datetime.datetime.fromtimestamp(os.stat(one_file_reference).st_ctime)
    in_folder = os.path.dirname(one_file_reference)
    work_folder_images = os.path.dirname(one_file_reference)

    if one_file_reference.lower().endswith(".pdf"):
        from lib.lib_util import from_pdf_to_list_pngs
        list_pngs, count_per_doc, list_of_list_of_pages = from_pdf_to_list_pngs(file, dpi = dpi, hash_id_treatment = hash_id_treatment, only_count = only_count)
    else :
        if multi_input:
            print("Internal error as of 24/6/24, behavior to be developped CDC TODO VR 24/6/24")
        # Non-PDF input: the file itself is the single "page".
        list_pngs = [one_file_reference]
        list_of_list_of_pages = [[1]]

    if len(list_pngs) == 0:
        print("TO activate after some test !")
#        saxia_all_doc_separated = True

    if saxia_all_doc_separated:
        print(" We should avoid doing split with prompt and treat all different case !")

    # Parse "<id>_<date>_<nb_pages>"-style information from the file name.
    from lib.lib_util import parse_id_date_nb_page_folder
    json_prefix_file = {}
    if parse_prefix_file :
        date_input = input["date"] if "date" in input else param_json["date"] if "date" in param_json else None
        nb, id, date = parse_id_date_nb_page_folder(one_file_reference)
        if id == 0:
            id = param_json["id"] if "id" in param_json else 0
        if nb == 0:
            nb = len(list_pngs)
        print(" date " + str(date) + " and id " + str(id) + " and nb " + str(nb) + " and file " + str(one_file_reference) + " and input keys " + str(input.keys()))
        if date == None:
            # Fall back to the provided date, or to today when none is given.
            print("Using date_input as date " + str(date_input) + " and date " + str(date) + " and id " + str(id) + " and nb " + str(nb) + " and file " + str(one_file_reference) + " and input keys " + str(input.keys()))
            if date_input == None:
                date = datetime.datetime.now().strftime("%Y%m%d")
            else :

                if type(date_input) == str:
                    from lib.lib_util import parse_date
                    date, parsed_or_forced = parse_date(date_input, settings=None)
                    date = date.strftime("%Y%m%d")
                else:
                    date = date_input.strftime("%Y%m%d")
        json_prefix_file = {"nb" : nb,
                            "date" : date,
                            "id" : id}
    filename_at = json_prefix_file["date"] if "date" in json_prefix_file else datetime.datetime.now().strftime("%Y%m%d 00:00:00")

    from lib.lib_util import create_prefix_file_name_from_json_prefix
    prefix_file = create_prefix_file_name_from_json_prefix(json_prefix_file)

    print("keyword_to_parse_for_suivi_and_crash_id_file : " + str(prefix_file))
    print("keyword_to_parse_for_suivi_and_crash_hit : " + str(hash_id_treatment))

    if only_extract_page:
        # Short-circuit: the caller only wants the rendered pages.
        output = {"files" : list_pngs, "nb_page" : len(list_pngs)}
    elif only_count:
        output = {}
    else :
        map_file_size = {}
        map_file_text = {}

        begin_page = bool(param_json['begin_page']) if 'begin_page' in param_json else None
        end_page = bool(param_json['end_page']) if 'end_page' in param_json else None
        limit = param_json["limit"] if "limit" in param_json else 0  # max pages to OCR (0 = no limit)
        parallel = bool(param_json["parallel"]) if "parallel" in param_json else False
        nb_thread = param_json["nb_thread"] if "nb_thread" in param_json else 10

        folder_export_boxes = param_json['folder_export_boxes'] if 'folder_export_boxes' in param_json else ""

        begin_page_txt = ""
        end_page_txt = ""


        complete_text = ""
        list_page_content = []
        list_page_content_text = []
        file_output = prefix_file + "_" + input["hash_id_treatment"] if "hash_id_treatment" in input else prefix_file + "_" + str(uuid4())

        request_used = input["preprompt"] if "preprompt" in input else None
        vllm_model = param_json["vllm_model"] if "vllm_model" in param_json else None #"mistral-small3.1"

        print(str(list_pngs))
        print(" verbose : " + str(verbose))
        print("About to parallel or not")
        if parallel and len(list_pngs) > nb_thread:
            print("WARNING : not implemented yet for parallel and more than nb_thread images")

        if parallel and len(list_pngs) <= nb_thread:
            # Parallel path: one thread per page (capped by nb_thread).
            from lib.datou.lib_parallel import multi_thread_image_read
            map_pids_path, map_sdp, map_text = multi_thread_image_read(model, verbose, map_file_size, map_file_text,
                                                                       folder_export_boxes, begin_page, end_page, file_output,
                                                                       nb_thread=nb_thread, list_pngs=list_pngs,
                                                                       layer_api = None, vllm_model = vllm_model)

            for i in range(len(list_pngs)):
                nb = i + 1
                sdp = map_sdp[nb]
                list_page_content.append(sdp)
                list_page_content_text.append(sdp.content)

            print(" Inside parallel ! ")
            print(" map_text.keys() " + str(map_text.keys()))
        else:
            # Sequential path: pages are OCRed one by one.
            map_text = {}
            count = 1
            for f in list_pngs:
                if limit > 0 and count > limit:
                    break
                count, text, sdp = sub_func_read_ocr(f, count, model, verbose, map_file_size, map_file_text, folder_export_boxes,
                                                     begin_page, end_page, file_output, layer_api=layer_api,
                                                     vllm_model=vllm_model, request_used = request_used)

                map_text[count] = text
                list_page_content.append(sdp)
                list_page_content_text.append(sdp.content)
                complete_text += begin_page_txt + text + end_page_txt
                count = count + 1
            print("Outside parallel")
            print(" map_text.keys() " + str(map_text.keys()))

        print(" map_text.keys() " + str(map_text.keys()))
        for page in map_text:
            print(" size : " + str(len(map_text[page])))

        if saxia_split_end_csv != "":
            # An explicit split specification overrides the PDF-derived grouping.
            print(" list_of_list_pages should be just [range(1, len(list_pngs))]")
            from lib.lib_util import build_list_of_list_from_split
            list_of_list_of_pages = build_list_of_list_from_split(saxia_split_end_csv, len(list_pngs))

        from lib.lib_util import create_transcript_group_of_pages
        complete_texts = create_transcript_group_of_pages(list_of_list_of_pages, map_text)
        if not multi_input and saxia_split_end_csv == "":
            print(" We should have only one group of page here !")
            if len(complete_texts) != 1:
                print(" WARNING data will be ignored !")
            complete_text = complete_texts[0]

        # NOTE(review): a ~45-line commented-out pre-refactor loop (building
        # complete_texts manually from count_per_doc / list_of_list_of_pages)
        # used to live here; it was superseded by
        # create_transcript_group_of_pages above (refactor of 11/9/24,
        # "TESTED IN ONE CASE, TO REMOVE ON 15/10/2024") and has been dropped.

        # TODO VR 5-4-25 : this is for auto split : not used yet
        if parse_date_test_before_own_datou_step:
            from lib.lib_util import parse_date_test_before_own_datou_step
            map_res_page_date = parse_date_test_before_own_datou_step(list_page_content)
            print("TO USE and TEST or use when failing in load_tab")
        else :
            map_res_page_date = {}

        # VR TODO: not happy with this at all — no, not at all 21-1-24.
        # The old hard-coded default prompt (carbon/calories/price table in
        # French) was removed; tolerate the preprompt either from the
        # interface input or from the default datou for jpg input.
        prefix_prompt_input = input["preprompt"] if "preprompt" in input else ""
        # TODO VR REFACTO : the files should also be deleted, here or elsewhere

        print("NIMP si estimer prefix_prompt_input : " + str(prefix_prompt_input)[:100])

        # NOTE(review): cost is charged per OCRed page under the
        # "google_ocr" label regardless of the engine used — confirm intent.
        ce.compute_cost_search("google_ocr", len(map_file_text))

        data = [
            {
                "id": file_output,
                "text": "\n".join(complete_texts)
            }
        ]

        if multi_input or saxia_split_end_csv != "":
            complete_text = complete_texts

        print("begin_page complete_text : " + str(complete_text[:100]).replace("\n", "§§"))

        # If the preprompt is empty we could also omit it, but the
        # prompt_gpt step handles that case anyway, so here we are!
        output = {"text" : complete_text, "preprompt" : prefix_prompt_input,
                  "json_to_save" : data,
                  "list_page_content" : list_page_content,
                  "list_page_content_text" : list_page_content_text,
                  # "map_file_size" : map_file_size, "map_file_text" : map_file_text,
                  "images": [f for f in list_pngs],
                  "paragraphs" : [p.list_blocks["paragraphs"] if "paragraphs" in p.list_blocks else [] for p in list_page_content],
                  "in_folder" : in_folder,
                  "work_folder_images" : work_folder_images,
                  "map_res_page_date" : map_res_page_date}

        if saxia_all_doc_separated and use_split_complet:
            print("TO TEST")
            output["multi_input"] = True
            output["text_only_for_meta_data_and_not_split"] = output["text"]
            output["text"] = []

    if parse_prefix_file:
        output["prefix_file"] = json_prefix_file
        output["id_file"] = prefix_file
        output["nb_page"] = len(list_pngs)
        output["filename_at"] = filename_at

    output["input_file_available_at"] = created_at
    output["size_file"] = size_file

    if create_output_hit:
        output["output_hit"] = prefix_file + "_" + input["hash_id_treatment"] if "hash_id_treatment" in input else prefix_file + "_" + str(uuid4())

    return output

440 

441 

442 

def datou_safia_step_request_gpt(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerPrompt = None) -> dict :
    """Prompt step: send preprompt + text to the LLM layer and return its answer.

    input may carry "preprompt", "text" (str or list of str), "multi_input"
    and "saxia_split_end_csv"; param_json may carry "preprompt",
    "exec_if_true", "openai_token" and "gpt_model". When exec_if_true is
    falsy the step is skipped and the input is returned unchanged.

    Returns {"result": ..., "request": ...}; "result" is the list of
    per-text answers when multi_input or a split specification is active.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["preprompt", "text"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["result", "request"]  # TODO VR REFACTO : this preprompt should be built elsewhere
    list_param_json = ["openai_token", "gpt_model"]  # TODO VR REFACTO : have an editing interface or a context from which to fetch this info

    if verbose :
        print("Inside request gpt")

    # Preprompt priority: non-empty input value, then param_json, then "".
    if "preprompt" in input and input["preprompt"] != "":
        preprompt = input["preprompt"]
    elif "preprompt" in param_json:
        preprompt = param_json["preprompt"]
    else :
        preprompt = ""
        print(" all keys input : " + str(input.keys()))
    text = input["text"] if "text" in input else ""
    multi_input = input["multi_input"] if "multi_input" in input else False
    saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""

    exec_if_true = param_json["exec_if_true"] if "exec_if_true" in param_json else True
    if not exec_if_true or exec_if_true == {}:
        print(" dont_exec_if_false is True, we skip the formatting step ")
        return input
    print("PAssed exec if true prompt step")

    if type(text) == list:
#        text = text[0]
        print(" ERROR treated as WARNING : only the first text will be treated : " + str(text))
        print("TODO multiple files not implemented yet !")
        if multi_input or saxia_split_end_csv != "":
            print(" Here we need to do something !")
            texts = text
        else :
            print("As of 24/6/24 internal error")
            # BUGFIX: `texts` was unbound on this path, raising NameError
            # below; fall back to treating each list item as a text.
            texts = text
    else:
        texts = [text]

    model = ""
    size_correct = True
    nb_token = 0
    result = ""
    request = ""
    if len(texts) == 0:
        print("List empty of texts as input prompt !")
    results = []
    for text in texts:
        print(" begin text begin_page " + str(text[:50].replace("\n", " ")))
        request = preprompt + text

        # TODO: to be removed, this was injected into the configuration
        openai_token = param_json["openai_token"]
        gpt_model = param_json["gpt_model"] if "gpt_model" in param_json else "gpt-4"

        from lib.lib_util import check_and_truncate_query_max_token
        size_correct, request_truncated = check_and_truncate_query_max_token(request)

        request_used = request if size_correct else request_truncated

        if verbose:
            print("before call request_gpt")
        try :
            if layer_api == None:
                (result, nb_token, model) = ("", 0, "")
            else :
                result, nb_token, model = layer_api.prompt(request_used, gpt_model, verbose = verbose)
        except Exception as e:
            # Best effort: never let one failed prompt abort the batch.
            print(str(e))
            result, nb_token, model = "", 0, "ERROR IN PROMPT"

        results.append(result)

#        from lib.lib_openai import request_gpt
#        result, nb_token, model = request_gpt(openai_token, request_used, gpt_model, verbose = verbose)
        if verbose :
            print("after request_gpt")
        ce.compute_cost_search(model, nb_token)
        if not size_correct:
            print("WARNING TOO LONG QUERY ")
            result = "Your query was too long and has been truncated :" + result

    if multi_input or saxia_split_end_csv != "":
        result = results

    return {"result" : result, "request" : request}

527 

528 

529 

530# send_mail 

def datou_safia_step_send_mail(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Notify the user with the step result, by email (SES) and optionally SMS/Slack.

    The result is wrapped with the auth info and a privacy/version footer,
    the raw request is written next to the input file as a .txt attachment
    when a file is given, and the email is sent to
    param_json["from_mail_to_send"] (plus input["cc"] when present).

    Returns {"object": subject, "body": result}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["request", "result", "file"]
    list_output = ["result", "object"]
    list_param_json = ["info_auth", "hash_id_treatment", "privacy", "from_mail_to_send"]

    privacy = param_json["privacy"] if "privacy" in param_json else False
    info_auth = param_json["info_auth"] if "info_auth" in param_json else None
    from_mail_to_send = param_json["from_mail_to_send"] if "from_mail_to_send" in param_json else None
    # NOTE: the original computed `privacy` twice; the duplicate was removed.
    hash_id_treatment = param_json["hash_id_treatment"] if "hash_id_treatment" in param_json else input["hash_id_treatment"] if "hash_id_treatment" in input else None
    send_mail = param_json["send_mail"] if "send_mail" in param_json else True
    send_sms = param_json["send_sms"] if "send_sms" in param_json else False
    send_slack = param_json["send_slack"] if "send_slack" in param_json else False
    type_email = param_json["type_email"] if "type_email" in param_json else "plain_text"

    # VR TODO : an override should probably be added (assoc can be used for
    # that), but when uploading audio files default behaviors are expected
    result = param_json["result"] if "result" in param_json else input["result"] if "result" in input else ""
    request = input["request"] if "request" in input else ""
    file = input["file"] if "file" in input else ""
    # Renamed from `object`, which shadowed the builtin.
    subject = input["object"] if "object" in input else param_json["object"] if "object" in param_json else "Prompt request by email to Fotonower assistant APIA"

    if type(subject) != str:
        print("object has not been converted to string, we will do it !")
        subject = str(subject)

    from auth.lib_conf_system import collect_version_from_datou_and_proj_and_app_recursively
    version = input["version"] if "version" in input else collect_version_from_datou_and_proj_and_app_recursively()

    if privacy :
        privacy_footer = """
    Privacy is ON, RGPD is strictly implemented and no data sent will be kept outside your email to address you issue and keep a record of your usage, please find more info here https://www.fotonower.com/fpa
    Used hash is :
    """ + str(hash_id_treatment)
    else :
        privacy_footer = """
    Privacy is OFF, You can OPT-OUT by sending an email to dpo@fotonower.com with object : OPT-OUT FPA (Fotonower Prompt Assistant) : """ + str(hash_id_treatment) + "<br>\n"

    version_footer = "Generated with Safia " + version + "<br>\n" + \
                     "More info on https://safia.app or https://saxia.app "

    import logging
    logger = logging.getLogger()
    logger.info("before send_mail test GITVELOURS in os.environ ")
    logger.info("before get_info_auth ")
    if type_email == "html":
        # The caller already produced full HTML; use it as-is.
        html = result
    else:
        html = str(info_auth) + str(result) + "<br><br><br>\n" + privacy_footer + "\n" + version_footer
    content_txt = str(info_auth) + "\n" + str(result) + "\n<br>\n<br><br><br>\n" + privacy_footer + "\n" + version_footer

    # Write the raw request next to the input file so it can be attached.
    from lib.lib_speechtotext import remove_extension
    txt_file = remove_extension(file) + ".txt"
    logger.info("After remove_extension ")
    import os, sys
    logger.info(os.getcwd())
    logger.info("to write txt_file " + str(txt_file))
    if file != "":
        with open(txt_file, "w") as f:
            f.write(request)
    else :
        # BUGFIX: the original called shutil.rmtree(txt_file, ignore_errors=True),
        # which only removes directories and therefore silently never deleted
        # the stale file; remove it explicitly instead.
        if os.path.isfile(txt_file):
            os.remove(txt_file)

    logger.info("After write ")

    if send_mail:
        if "GITVELOURS" in os.environ :
            logger.info("YES for GITVELOURS : " + str(os.environ["GITVELOURS"]))
            pythonpathfotonower = os.path.join(os.environ["GITVELOURS"], "python")
            sys.path.append(pythonpathfotonower)
            logger.info("before import ses mailer ! ")
            import mtr.ses_mailer
            ses_mailer = mtr.ses_mailer.SesMailer()

        logger.info("before get_from_mail_to_send ! ")
        dest_mail_list = from_mail_to_send

        logger.info("logger About to send email : " + str(dest_mail_list))

        if "cc" in input:
            dest_mail_list += "," + input["cc"]

        print ("About to send email !")
        if verbose :
            print (" type html : " + str(type(html)))
            print ("html : " + str(html))

        sender = "assistant@fotonower.com"
        try :
            print(" Maybe type_email is useless 28/12/25 ")
            print(" Sending " + sender + " to " + str(dest_mail_list))
            if file != "":
                ret = ses_mailer.send_email_with_attachment(sender, dest_mail_list,
                                                            subject, body_html = html, file_path = txt_file, body_text = content_txt)
            else :
                html = result
                ret = ses_mailer.send_html_email(sender, dest_mail_list, subject, html, content_txt)
        except Exception as e:
            # NOTE(review): send failures are swallowed on purpose (best
            # effort notification) — only printed.
            print(str(e))

    if send_sms:
        print("TODO send_sms")
    if send_slack:
        print("TODO send_slack")

    return {"object" : subject, "body" : result} # , "version" : version

637 

638# git_action 

def datou_safia_step_git_action(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Append the request/result pair as a comment on a GitHub issue.

    param_json must provide "privacy", "defaut_github_issue"
    ("owner/repo/issue_number") and "github_token"; input must provide
    "request" and "result". When privacy is on, or the issue reference /
    token is invalid, nothing is posted.

    Returns {"log_git_action": "git_action_done"}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["request", "result"]
    list_output = []
    list_param_json = ["defaut_github_issue", "github_token", "privacy"]

    privacy = param_json["privacy"]
    defaut_github_issue = param_json["defaut_github_issue"]
    github_token = param_json["github_token"]

    request = input["request"]
    result = input["result"]

    if privacy:
        print("With privacy enabled, we do not append info to any github issues so we continue !")
    else :
        # Expected form: "<owner>/<repo>/<issue_number>".
        own_repo_nb = defaut_github_issue.split("/")
        if github_token != "" and len(own_repo_nb) == 3 and own_repo_nb[2].isdigit():
            own_repo = own_repo_nb[0] + "/" + own_repo_nb[1]
            issue_number = int(own_repo_nb[2])
            from lib.lib_github import append_comment
            message_comment_github = "[up](#up)\n\n" + result + "\n<br>\n----\nMESSAGE BRUT\n------<details>\n\n" + request + "\n</details>"
            append_comment(github_token, verbose = verbose,
                           message_comment = message_comment_github,
                           OwnRepo = own_repo,
                           issue_number = issue_number)
            print("github message appened !")

    output = {"log_git_action" : "git_action_done"}
    return output

668 

669 

670 

671# doc_to_json TO TEST 

def datou_safia_step_doc_to_json(input : dict = None, param_json : dict = None, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Convert a document file into the importable JSON format via safia_import.

    input must provide "file"; param_json may provide "managed_extension"
    as a comma-separated list of extensions.

    Returns {"json_path", "log_d2j", "unmanaged_extension"}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["file"]
    list_output = ["json_path", "log_d2j"]
    list_param_json = []  # managed_extension (for freemium ?)

    file = input["file"]

    # TODO VR : get the list from somewhere else
    # BUGFIX: the original used .split(""), which raises
    # ValueError("empty separator"); the list is comma-separated.
    managed_extension = param_json["managed_extension"].split(",") if "managed_extension" in param_json else [".mp3", ".ogg", ".amr", ".m4u", ".wav", ".jpeg", ".jpg", ".png", ".pdf", ".txt", ".docx", ".json", ".py"]

    from lib.lib_safia import safia_import
    json_to_import_path, list_detailed_time_safia_import_to_json, sorted_dict_unmanaged_extension = safia_import(in_file=file, # managed_extension=managed_extension,
                                                                                                                 verbose = False)

    output = {"json_path" : json_to_import_path, "log_d2j" : list_detailed_time_safia_import_to_json, "unmanaged_extension" : str(sorted_dict_unmanaged_extension)}
    return output

688 

689 

690 

691# import_json TO TEST 

def datou_safia_step_import_json(input : dict = None, param_json : dict = None, ce : CE = None,
                                 verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Import a JSON dump (file path or in-memory list) into the documents table.

    input must provide "json_path" or "json_to_save" (one of them);
    param_json may provide "table_documents" and "openai_token". The cost
    estimator is charged with the tokens consumed by the import, or with
    sentinel values ("crashed", -1) when the import fails.

    Returns {"log_import_json": ...}.
    """
    # BUGFIX: mutable default arguments ({}) replaced by None sentinels.
    if input is None:
        input = {}
    if param_json is None:
        param_json = {}

    list_input = ["json_path", "json_to_save"]  # one of them
    list_output = ["log_import"]
    list_param_json = ["table_documents", "openai_token"]

    json_path = input["json_path"] if "json_path" in input else ""
    json_to_save = input["json_to_save"] if "json_to_save" in input else []

    table_documents = param_json["table_documents"] if "table_documents" in param_json else ""
    openai_token = param_json["openai_token"] if "openai_token" in param_json else ""

    from lib.import_util.lib_import_retrieval.scripts.process_json.process_json import process_json_dump, process_json_dump_aux
    import asyncio

    # logger.info(" before process_json_dump : json_to_import_path : " + str(json_path))
    try :
        from server.safia import lpgss_singleton  # VR: to refactor with abstract classes ?
        lpgss_singleton.get_admin_situation(verbose=verbose)
        if json_path != "":  # TODO better test existence ??
            total_nb_token, used_model = asyncio.run(process_json_dump(json_path, {}, False, False, None, lpgss_singleton, openai_token, table_documents, verbose = verbose))
        else :
            total_nb_token, used_model = asyncio.run(process_json_dump_aux(json_to_save, {}, False, False, None, lpgss_singleton, openai_token, table_documents, verbose = verbose))
        # result_json["log"] += " ,after process json to documents table : total_nb_token : " + str(total_nb_token)

    except Exception as e:
        # Best effort: log the failure and fall through with sentinel
        # values so the cost accounting below still runs.
        import logging
        logger = logging.getLogger()
        logger.info(str(e))
        print(str(e))
        logger.info("Bug in datou_safia_step_import_json")
        print("Bug in datou_safia_step_import_json")
        used_model = "crashed"
        total_nb_token = -1

    ce.compute_cost_search(used_model, total_nb_token)

    output = {"log_import_json" : "Inserted in file : " + str(json_path)}
    return output

733 

734 

735 

def datou_safia_step_get_embedding(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Compute the embedding vector of input["text"] via the OpenAI helper.

    Requires param_json["openai_token"]. Returns the vector under "embedding"
    plus display hints under "info_context_exec" telling the execution context
    to hide the (large) embedding and show the context info.
    """
    list_input = ["text"]
    list_output = ["embedding"]
    list_param_json = ["openai_token"]

    # datou_step
    from lib.lib_openai import get_embeddings

    # embedding_model fixed for now
    vector = get_embeddings(input["text"], param_json["openai_token"], verbose = verbose)

    display_hints = {"display_info" : {"embedding" : "delete", "info_context_exec" : "show"}}

    return {"embedding" : vector, "info_context_exec" : display_hints}

753 

754 

755 

# append_to_doc_content TO TEST
def datou_safia_step_append_to_doc_content(input : dict = {}, param_json : dict = {},
                                           ce : CE = None, verbose : bool = False,
                                           layer_api : LayerGeneric = None) -> dict :
    """Append input["result"] (or input["text"]) to an existing document's content.

    Loads the document identified by document_id/project_id through
    LibSafiaSystem, concatenates the new text, saves it back (which recomputes
    embeddings) and accounts the token cost on `ce`.
    Returns {"result": <appended text>, "references": [document_id]}.
    """
    list_input = ["result", "document_id", "project_id", "user_id"]  # TODO VR: unclear whether datous are right-safe or must check rights themselves =>
    # - [ ] TODO: the terminology still has to be defined
    list_output = ["references"]
    list_param_json = ["openai_token"]  # TODO VR 15-6-23: in_match_count optional => handle optional param_json entries

    openai_token = param_json["openai_token"]

    # Ids may come from input or param_json, with fall-back defaults.
    document_id = input["document_id"] if "document_id" in input else param_json["document_id"] if "document_id" in param_json else ""
    project_id = input["project_id"] if "project_id" in input else param_json["project_id"] if "project_id" in param_json else 0
    user_id = param_json["user_id"] if "user_id" in param_json else 0
    result = input["result"] if "result" in input else input["text"] if "text" in input else ""

    # NOTE: table_documents is read but not used below — kept for future use?
    table_documents = param_json["table_documents"] if "table_documents" in param_json else "dummy_table_documents_no_access"

    # VR TODO 4-12-23: fetching the project_id will be difficult because datous
    # seem bound to one (default) project. We may want varied per-project
    # behavior later (including separate input and output/storage projects).
#    has_access = lss.lib_right.get_role_on_project(lss.get_user_id(), project_id)
    from datetime import datetime
    today_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # NOTE: currently unused
    from server.safia import lpgss_singleton, lib_right_singleton

    # TODO VR: is this really what we want? Shouldn't we rather fetch the lss
    # from the execution context (which would carry the user_id)?
    from lib.lib_safia_system import LibSafiaSystem
    lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
    lss.user_id = user_id  # HACK: impersonate the user directly
    content = lss.load_document(document_id, project_id, chunk_id=None, verbose = verbose)

    new_content = content + "\n" + result

#    list_docs = lpgss_singleton.get_documents(table_documents, document_id, verbose = verbose)
#    if len(list_docs) != 1:
#        print(" Problem with documents to append ! ")

#    content = list_docs[0]["content"] + "\n" + result  # TODO VR 4-12-23: we could also tag the text as an append

    save_document_data = {"document_id" : document_id, "document_content" : new_content}
    # TODO VR: better to abstract this (in the layer) and not pass the
    # openai_token down into process_json or anywhere else!
    total_nb_token, used_model = lss.save_document(save_document_data, project_id, openai_token=openai_token)
    # NOTE(review): ce defaults to None — this crashes if no CostEstimation is passed.
    ce.compute_cost_search(used_model, total_nb_token)

    # TODO VR: the result is probably already inside the document since we
    # append everything as we go!
    output = {"result" : result, "references" : [document_id]}
    return output

804 

# search_doc_NN TO TEST
def datou_safia_step_search_doc_NN(input : dict = {}, param_json : dict = {},
                                   ce : CE = None, verbose : bool = False,
                                   layer_api : LayerGeneric = None) -> dict :
    """Nearest-neighbour document search from an embedding, building a RAG prompt.

    Parameters
    ----------
    input : dict with "embedding" (query vector) and optionally "text"
        (the original question).
    param_json : dict with "match_page_sections" (pgvector match function
        name) and optional "in_match_count" (number of neighbours, default 5).

    Returns the raw matches ("result"), the assembled prompt (duplicated as
    "request" and "text" for downstream steps) and the matched document ids
    as "references".
    """
    list_input = ["embedding"]
    list_output = []
    list_param_json = ["match_page_sections", "in_match_count"]  # TODO VR 15-6-23: in_match_count optional => handle optional param_json; VR 4-12-23: should these rather live in input?

    match_page_sections = param_json["match_page_sections"]
    in_match_count = param_json["in_match_count"] if "in_match_count" in param_json else 5
    embedding = input["embedding"]
    text = input["text"] if "text" in input else ""

    from server.safia import lpgss_singleton

    from lib.stockage.lib_pgvector import find_docs
    # BUGFIX: in_match_count and verbose were previously hard-coded (5 and
    # False), silently ignoring the caller-provided parameters.
    result = find_docs(embedding, lpgss_singleton,
                       function = match_page_sections,
                       in_match_count = in_match_count,
                       verbose = verbose)

    preprompt = "Merci de repondre à la question à partir des documents et ne pas mentionné que tu es un chatbot sinon quelqu'un va mourir :"

    request = preprompt + text + str(list(map(lambda x : x["document_id"] + " " + x["content"], result)))

    list_document_ids = list(map(lambda x : x["id"], result))  # document_id is without the chunk id

    print("request : " + str(request))

    output = {"result" : result, "request" : request, "text" : request, "references" : list_document_ids}
    return output

836 

837 

838 

# result_to_json
def datou_safia_step_result_to_json(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    """Package a request/result pair as a JSON document list ready to be saved.

    Builds two document entries (request and result) with ids derived from a
    sanitized user marker plus a timestamp, appending a references section to
    the result text. Returns {"json_to_save": [...], "references": [...]}.
    """
    list_input = ["request", "result"]
    list_output = []
    list_param_json = ["user"]

    print("TO USE TO CREATE NEW STEP")

    user = param_json["user"] if "user" in param_json else None
    if user == None:
        print(" This show map_reduce doesn't pass correclty the complete_param_json and that this param doesn't have the user key because the mismatch between app_param, user_param, exec_param, step_param is not well defined ! ")
        return {}

    prefix = param_json["prefix"] if "prefix" in param_json else ""

    from lib.lib_util import replace_non_alpha_with_underscore

    # NOTE(review): dead branch — user can no longer be None here because of
    # the early return above, so the else always runs and every user becomes
    # "0.0.0.0". Likely intended: sanitize the real user string instead.
    if user == None:
        user = replace_non_alpha_with_underscore("anonymous@opio.fr")
    else :
        user = "0.0.0.0"

    from datetime import datetime
    curr = datetime.now().strftime("%Y%m%d%H%M%S%f")
    # NOTE(review): prefix defaults to "" and is never None, so the first
    # branch is unreachable; both branches produce the same name when prefix
    # is empty anyway.
    if prefix == None:
        name = user + "_" + curr
    else :
        name = prefix + user + "_" + curr

    # NOTE: total_nb_token/used_model are initialized but never used here.
    total_nb_token = 0
    used_model = ""
    references = []
    if "request" in input and "result" in input:
        request = input["request"]
        result = input["result"]

        id_request = "///UPLOAD//REQUEST//" + name
        id_result = "///UPLOAD//RESULT//" + name
        references = [id_request, id_result]

        # TODO VR : here we should parse title in result and set in request and result ?
        # TODO VR : how should hostname be configured, in context_process_server ?
        # TODO VR: temporary files must be deleted, keeping only the wanted CDN ones

        # Build the human-readable references section appended to the result.
        list_reference = "\n## No references\n"
        if "references" in input:
            list_reference = "\n## References Internal and External\n"
            r = 0
            for ref in input["references"]:
                # Audio references are served as mp3 once converted.
                if ref.endswith(".ogg"):
                    ref = ref.replace(".ogg", ".mp3")
                list_reference += "Ref " + str(r) + " : " + ref + "\n"
                r = r + 1

        data = [
            {
                "id" : id_request,
                "text" : request
            },
            {
                "id" : id_result,
                "text" : result + list_reference
            }
        ]
    else :
        data = []

    output = {"json_to_save" : data, "references" : references}
    return output

908 

909 

910 

# load_graph => to debug TODO VR to debug
def datou_safia_step_load_existing_graph(input : dict = {}, param_json : dict = {},
                                         ce : CE = None, verbose : bool = False) -> dict :  # pragma no cover icebox
    """Load the previously saved graph and wrap it as a pre-prompt fragment."""
    list_input = []  # TODO VR: add "file"
    list_output = ["preprompt"]
    list_param_json = []  # TODO VR REFACTO

    print("TO define")
    # TODO VR 14-6: how do we assemble prompts from several pieces?
    # VR: this has not worked yet!
    # if object == "edit_graph":
    from lib.lib_graph import read_graph

    serialized_graph = read_graph(temp_dir = "static/temp/graph",
                                  graph_name = "graph",
                                  verbose = verbose)
    preprompt_text = "\nEn prenant comme graph de départ celui-ci :\n" + serialized_graph
    return {"preprompt" : preprompt_text}

928 

def datou_safia_step_load_url_content_text(input: dict = {},
                                           param_json: dict = {},
                                           ce: CE = None,
                                           verbose: bool = False,
                                           layer_api : LayerGeneric = None) -> dict:
    """Fetch a web page and return its visible text under the "content" key.

    On request failure, returns {"content": "Error in request" + <error>}.
    """
    list_input = ["url"]
    list_output = ["content"]
    list_param_json = []

    import requests
    from bs4 import BeautifulSoup

    # URL of the page whose text we want to extract (company site by default).
    url = input["url"] if "url" in input else "https://www.fotonower.com/"
    print("url : " + str(url))

    try:
        response = requests.get(url)
    except Exception as e:
        print(str(e))
        return {"content": "Error in request" + str(e)}

    # Parse the HTML, flatten it to text and normalize separators to spaces.
    page_text = BeautifulSoup(response.text, 'html.parser').get_text(separator='<br>', strip=True)
    for unwanted in ("|", "\n", "\r"):
        page_text = page_text.replace(unwanted, " ")

    if verbose:
        print(page_text)

    return {"content": page_text}

968 

def aux_map_reduce_loop(list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps, param_json,
                        verbose, privacy, with_audit, strat_reduce, reduced_result, list_audit_map_reduce,
                        hit_main_datou_step_map_reduce = "to_be_passed_as_argument", id_step_incomplete_args = None):
    """Sequential map phase of map_reduce: run the sub-datou on each chunk and fold results.

    For each element of list_texts, executes the datou described by
    (list_steps, list_param_json_steps) via datou_exec, collects audit info
    into list_audit_map_reduce (mutated in place), and combines outputs into
    reduced_result according to strat_reduce ("concat" concatenates,
    "append_page" relies on in-place mutation of the shared page objects,
    "run_one_datou" keeps the last output dict). Returns reduced_result.
    """
    for textbout in list_texts:
        if verbose:
            print("text : " + str(textbout))
            print(" hit_main from map_reduce : " + hit_main_datou_step_map_reduce)
        from lib.datou.datou_exec import datou_exec
        # For "run_one_datou" the chunk is already a full input dict;
        # otherwise wrap the chunk under the expected input key.
        input_datou = textbout if strat_reduce == "run_one_datou" else {aux_input_var: textbout}

        # TODO clean up: nothing is forced for now VR 15-1-25
        # Save the audit flag so it can be restored after the sub-execution.
        with_audit_save_var = with_audit
        # with_audit = False
        if 'with_audit' in input_datou:
            with_audit_save = input_datou['with_audit']
            # input_datou['with_audit'] = False

        output, audit_json = datou_exec(list_steps, input=input_datou,
                                        complete_param_json=param_json, verbose=verbose,
                                        privacy=privacy, list_param_json_steps=list_param_json_steps,
                                        with_audit=with_audit, id_step_incomplete_args = id_step_incomplete_args)

        # "run_one_datou" keeps the whole output dict; otherwise extract the
        # requested field, defaulting to "".
        result = output if strat_reduce == "run_one_datou" else output[res_json_field] if res_json_field in output else ""
        if with_audit:
            if verbose:
                print("audit_json : " + str(audit_json))
            list_audit_map_reduce.append(audit_json)

        # Restore the audit flag on the (possibly shared) input dict.
        if 'with_audit' in input_datou:
            try :
                input_datou['with_audit'] = with_audit_save
            except Exception as e:
                print(" Incompréhensible que cela ne fonctionne pas")
        with_audit = with_audit_save_var

        # TODO TO TEST VR 26-1-24 : and add image also ?!?
        # df[res_json_field] = result

        if strat_reduce == "concat":
            reduced_result += result
        elif strat_reduce == "append_page":
            print(
                "Thanks object (sub_page_doc) that are reference in python, the result is already at its correct position")
        elif strat_reduce == "run_one_datou":
            reduced_result = result
            reduced_result["hit_internal"] = "tofind"
        else:
            print("Unsupported : strat_reduce : " + str(strat_reduce))

    return reduced_result

1019 

def datou_safia_step_map_reduce(input : dict = {},
                                param_json : dict = {},
                                ce : CE = None,
                                verbose : bool = False,
                                layer_api : LayerGeneric = None) -> dict :
    """Split the input into chunks, run a sub-datou on each, and reduce the results.

    The splitting strategy is chosen by param_json["strat_reduce"]:
    "concat" / "concat_stride" (text chunks concatenated back), "append_page"
    (per-page, results written in place), "append_doc" (per-document), or
    "run_one_datou" (the whole input dict is forwarded once). The sub-datou is
    looked up by datou_int_id and executed either sequentially
    (aux_map_reduce_loop) or in parallel. Returns the reduced text plus a
    "json_to_save" document and the collected audits.
    """
    list_input = ["text", "datou_int_id", "strat_reduce", "param.size", "param.overlap", "res_json_field"]
    # TODO update sql input
    list_output = []
    list_param_json = []
    with_audit = input["with_audit"] if "with_audit" in input else False

#    "text", "datou_int_id", "strat_reduce", "param.size", "param.overlap"

    text = input["text"] if "text" in input else ""
    list_page_content = input["list_page_content"] if "list_page_content" in input else []
    list_page_content_text = input["list_page_content_text"] if "list_page_content_text" in input else []
    paragraphs = input["paragraphs"] if "paragraphs" in input else []
    id_step_incomplete_args = param_json["id_step_incomplete_args"] if "id_step_incomplete_args" in param_json else None

    # VR 19-5: not sure anymore why the lookup in input is needed here
    datou_int_id = input["datou_int_id"] if "datou_int_id" in input else param_json["datou_int_id"] if "datou_int_id" in param_json else -1
    strat_reduce = param_json["strat_reduce"] if "strat_reduce" in param_json else "concat"
    param = param_json["param"] if "param" in param_json else {"size" : 10000, "overlap" : 1000}
    res_json_field = input["res_json_field"] if "res_json_field" in input else "result"
    list_page_per_doc = input["list_page_per_doc"] if "list_page_per_doc" in input else param["list_page_per_doc"] if "list_page_per_doc" in param else 0  # VR 17-5 TODO: reading this from param is probably a debugging leftover

    parallel = bool(param_json["parallel"]) if "parallel" in param_json else False
    nb_thread = param_json["nb_thread"] if "nb_thread" in param_json else 10

    # No per-document grouping provided: default to one page per document.
    if list_page_per_doc == 0:
        print("CHECK ERROR using default list of page not grouped by document, maybe normal if it is the first map_reduce where we try to classify : datou_int_id : " + str(datou_int_id))
        if len(list_page_content_text) != len(list_page_content):
            print("ERROR migration datou_exec_partial_data_json")
        list_page_per_doc = ";".join(list(map(str, range(1, len(list_page_content_text) + 1))))

    from lib.lib_util import split_text, split_text_by_doc, split_list_page_by_doc, split_list_page_by_page
    aux_input_var = "text"
    curr_datou_id = None
    # NOTE(review): if strat_reduce is unsupported or a split raises,
    # list_texts may be unbound below — TODO confirm intended behavior.
    if strat_reduce == "concat":
        print("concat")

        # Full concatenated text (should we split on BEGIN/END? GRRR).
        # This case is for old version generated before 30/6/25 when we want to do partial exec for stat study !
        if list_page_content == []:
            from lib.lib_util import managing_deprecated_input_text_concat_into_list
            list_texts = managing_deprecated_input_text_concat_into_list(text, list_page_per_doc)

        else:
            if text != None and (len(list_page_per_doc) == len(text) or (len(list_page_per_doc) == 1 and type(text) == str)):
                print("NEVER CORRECT OR USELESS ANYWAY We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
            # if type(text) == str:
            #     list_texts = [text]

            try:
                list_texts = split_text_by_doc(list_page_content_text, list_page_per_doc)
            except Exception as e:
                print("l 1038 If only one doc it could run !, and maybe alos with multiple doc by the way")
                print(str(e))
    elif strat_reduce == "append_page":
        list_texts = split_list_page_by_page(paragraphs)
        aux_input_var = "paragraphs"
    elif strat_reduce == "append_doc":
        if text != None and (len(list_page_per_doc) == len(text) or (len(list_page_per_doc) == 1 and type(text) == str)):
            print("We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
            if type(text) == str:
                list_texts = [text]
        try:
            list_texts = split_list_page_by_doc(list_page_content_text, list_page_per_doc)
        except Exception as e:
            print("l 1051 If only one doc it could run !, and maybe also with multiple doc by the way")
            print(str(e))
        aux_input_var = "list_page_content"
    elif strat_reduce == "concat_stride":  # TODO add test for these arguments
        list_texts = split_text(text, param["size"], param["overlap"])
    elif strat_reduce == "run_one_datou":
        # default behavior set internal datou id for run_one_datou
        if "datou_exec_info" in input and "mtr_datou_id" in input["datou_exec_info"]:
            curr_datou_id = input["datou_exec_info"]["mtr_datou_id"]
            input["datou_exec_info"]["mtr_datou_id"] = datou_int_id
# VR42: to add — probably only when with_audit or when datou_exec_info is present => and we would need it all the time in my opinion!
        if "datou_exec_info" in input:
            input["datou_exec_info"]["mtr_datou_id"] = datou_int_id
        list_texts = [input]
    else :
        print("Unsupported : strat_reduce : " + str(strat_reduce))

    # user HACK right on datou
    from server.safia import lpgss_singleton, lib_right_singleton

    # TODO VR: is this really what we want? Shouldn't we rather fetch the lss
    # from the execution context (which would carry the user_id)?
    from lib.lib_safia_system import LibSafiaSystem
    lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
    lss.user_id = 1  # HACK: hard-coded admin user (see l. 1106)
    project_id = input["project_id"] if "project_id" in input else None
    datous = lss.get_datou(datou_int_id, project_id=project_id)

    # Locate the requested datou among the returned list.
    datou = None
    for d in datous:
        if d["id"] == datou_int_id:
            datou = d
            break

    if datou == None:
        print("Datou unavailable " + str(datou_int_id))
        return {"text" : "Datou unavailable " + str(datou_int_id)}

    import pandas as pd
    # NOTE(review): type(input["df"]) == type(pd.DataFrame) compares against
    # the metaclass `type`, so it is True only if input["df"] is itself a
    # class — likely intended isinstance(input["df"], pd.DataFrame).
    # Debug-only block (just prints), so behavior is harmless.
    if "df" in input and type(input["df"]) == type(pd.DataFrame):
        df = input["df"] if "df" in input else None
#        print("len list_texts : " + str(len(list_texts)))
        print("len(df.values) : " + str(len(df.values)))

    # Initialize the accumulator for the reduce phase.
    reduced_result = None
    if strat_reduce == "concat":
        reduced_result = ""
    else :
        print("NOT needed (message isn't correct 16-1-25 ) Unsupported : strat_reduce : " + str(strat_reduce))
        reduced_result = None

    list_audit_map_reduce = []
    id_intern_map = 0  # NOTE: currently unused

    hit_main = input["hash_id_treatment"] if "hash_id_treatment" in input else "hit_unknown"

    privacy = True
    list_param_json_steps = list(map(lambda x: x["param_json"], datou["steps"]))
    list_steps = list(map(lambda x: x["name"], datou["steps"]))
    if parallel :
        print("PALAFI")
        from lib.datou.lib_parallel import datou_parallel_map_reduce
        reduced_result, list_audit_map_reduce = datou_parallel_map_reduce(list_texts, res_json_field, aux_input_var,
                                                                          list_steps, list_param_json_steps, param_json,
                                                                          verbose, privacy, with_audit, strat_reduce,
                                                                          nb_thread, hit_main, id_step_incomplete_args)
        print("PALAVI")
    else:
        # input:
        #   list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps, param_json, verbose, privacy, with_audit, strat_reduce, list_audit_map_reduce
        # output:
        #   reduced_result (list_audit_map_reduce is mutated in place)
        reduced_result = aux_map_reduce_loop(list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps,
                                             param_json,
                                             verbose, privacy, with_audit, strat_reduce, reduced_result, list_audit_map_reduce, hit_main, id_step_incomplete_args)

    document_safia_id = "map_reduce_" + (input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4()))

    data = [
        {
            "id": document_safia_id,
            "text": reduced_result
        }
    ]

    # Restore the caller's mtr_datou_id if we overrode it for run_one_datou.
    if curr_datou_id != None:
        input["datou_exec_info"]["mtr_datou_id"] = curr_datou_id
    output = {"text" : reduced_result, "json_to_save" : data, "list_audit_map_reduce" : list_audit_map_reduce}
    if output["text"] == None:
        del output["text"]
    return output

1179 

1180 

1181def datou_safia_step_load_tab(input : dict = {}, 

1182 param_json : dict = {}, 

1183 ce : CE = None, 

1184 verbose : bool = False, 

1185 layer_api : LayerGeneric = None) -> dict : 

1186 list_input = ["text"] 

1187 list_output = ["df"] # and others custom 

1188 list_param_json = ["col_to_input"] 

1189 

1190 col_to_input = param_json["col_to_input"] if "col_to_input" in param_json else [] 

1191 output_df = param_json["output_df"] if "output_df" in param_json else "df" 

1192 merge = param_json["merge"] if "merge" in param_json else None 

1193 multi_input = input["multi_input"] if "multi_input" in input else False 

1194 aggregate_multi_input = input["aggregate_multi_input"] if "aggregate_multi_input" in input else False 

1195 saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else "" 

1196 

1197 max_nb_try = param_json["max_nb_try"] if "max_nb_try" in param_json else 3 

1198 trigger = param_json["trigger"] if "trigger" in param_json else "nb_pages_80" 

1199 retry_step_id = param_json["retry_step_id"] if "retry_step_id" in param_json else 2 

1200 to_be_used = param_json["to_be_used"] if "to_be_used" in param_json else "split_at_10" 

1201 

1202 # markdown, json, auto_detect ?? 

1203 format_input = param_json["format_input"] if "format_input" in param_json else "markdown" 

1204 

1205 {"nb_try": 3, "trigger": "nb_pages_80", "retry_step_id": 2, "to_be_used": "split_at_10"} 

1206 

1207 saxia_all_doc_separated = input["saxia_all_doc_separated"] if "saxia_all_doc_separated" in input else False 

1208 

1209 nb_page = input["nb_page"] if "nb_page" in input else 0 

1210 

1211 if saxia_all_doc_separated: 

1212 print("Need to use only saxia_split_end_csv : " + str(saxia_split_end_csv)) 

1213 print("We should force format_input to json") 

1214 input["result"] = [] 

1215 format_input = "json_as_dict" 

1216 id_page_id = 0 

1217 if saxia_split_end_csv != "": 

1218 saxia_split_end_csv += "," 

1219 saxia_split_end_csv += str(nb_page) 

1220 for list_of_page_read in saxia_split_end_csv.split(","): 

1221 list_of_page_read_int = int(list_of_page_read) 

1222 list_of_page_list = list(range(id_page_id + 1, list_of_page_read_int + 1)) 

1223 id_page_id = list_of_page_read_int 

1224 list_of_page_csv = ",".join(list(map(str, list_of_page_list))) 

1225 one_list_of_page = {"Liste des pages": [list_of_page_csv]} 

1226 # Titre 

1227 # Nombre de pages 

1228 # Commentaires 

1229 # document_type 

1230 input["result"].append(one_list_of_page) 

1231 

1232 if id_page_id != nb_page: 

1233 print("ERROR : saxia_split_end_csv : " + str(saxia_split_end_csv) + " and nb_page : " + str(nb_page)) 

1234 

1235 # find function used in datou_batch to maje df from text 

1236 from lib.batch.lib_batch import create_pandas_table_from_text 

1237 from lib.lib_util import parse_json_from_prompt_result 

1238 

1239 result = input["result"]# if "result" in input else "" 

1240 # Il faut vérifier que result = text fasse planter les tests 

1241 if type(result) == list: 

1242 #result = result[0] 

1243 print(" ERROR treated as WARNING : only the first file will be treated : " + str(result)) 

1244 print("TODO multiple files not implemented yet !") 

1245 if multi_input: 

1246 if aggregate_multi_input: 

1247 # result = " ".join(result) 

1248 print("We will need to do something !") 

1249 else: 

1250 print("internal ERROR : multiple files not implemented yet !") 

1251 

1252 import pandas as pd 

1253 # todo vr 27-12-23 normaliser les input et utilisation des assoc 

1254 if multi_input or saxia_split_end_csv != "": 

1255 print(" For now aggregate_multi_input is set to True by default in case of multi input") 

1256 complete_df = None 

1257 for res in result : 

1258 if format_input == "markdown": 

1259 df = create_pandas_table_from_text(res, verbose = verbose) if "result" in input else pd.DataFrame(columns=["Init"]) 

1260 elif format_input == "json": 

1261 df = pd.DataFrame(parse_json_from_prompt_result(res)) 

1262 elif format_input == "json_as_dict": 

1263 df = pd.DataFrame(res) 

1264 else: 

1265 print("format_input " + format_input + " not implemented yet ! ") 

1266 if type(complete_df) == types.NoneType: 

1267 complete_df = df 

1268 else: 

1269 complete_df = pd.concat([complete_df, df], axis=0, ignore_index=True) 

1270 df = complete_df 

1271 else: 

1272 if format_input == "markdown": 

1273 df = create_pandas_table_from_text(result, verbose = verbose) if "result" in input else pd.DataFrame(columns=["Init"]) 

1274 elif format_input == "json": 

1275 df = pd.DataFrame(parse_json_from_prompt_result(result)) 

1276 else: 

1277 print("format_input " + format_input + " not implemented yet ! ") 

1278 

1279 output = {output_df : df} 

1280 for col in col_to_input: 

1281 strat = col["strat"] if "strat" in col else "concat_sccsv" 

1282 col_name = col["col_name"] if "col_name" in col else None 

1283 input_name = col["input_name"] if "input_name" in col else None 

1284 all_data_to_clean = list(df[col_name]) if type(df) == pd.DataFrame and col_name in df.columns else [] 

1285 # TODO better car mieux vaut quick and dirty que de ne pas avancer, vraisemblablement VR 20-12-23 

1286 try: 

1287 all_data = [d.replace(" ", "") for d in all_data_to_clean] # .replace("-", ",") = VR 14-2-24 a present gérer par le parsing begin_end manuelle avec 3 etats 

1288 except : 

1289 all_data = all_data_to_clean 

1290 io_data_datou = ";".join(all_data) 

1291 if input_name : 

1292 output[input_name] = io_data_datou 

1293 

1294 if merge != None or trigger == "nb_pages_80": 

1295 df_by_page = df 

1296 df_by_document = df # input["df"] if "df" in input else None 

1297 # Faire une boucle sur les documents et les pages pour vérifier que c'est correct 

1298 # TODO VR 27-12-23 : faire un test pour vérifier que c'est correct 

1299 print(" In datou_safia_step_load_tab ") 

1300 try: 

1301 res1 = df_by_document.to_markdown() 

1302 res2 = df_by_page.to_markdown() 

1303 res3 = df_by_document.to_json() 

1304 res4 = df_by_page.to_json() 

1305 if verbose : 

1306 print(res1) 

1307 print(res2) 

1308 print(res3) 

1309 print(res4) 

1310 except Exception as e: 

1311 print(str(e)) 

1312 try: 

1313 # La liste des pages dans df_by_document a pour nom de colonne "Liste des pages" 

1314 # L'information dans df_by_page sur le fait d'etre le debut ou la fin d'un document est dans la colonne Information_debut_fin 

1315 # Pour chaque ligne de df_by_document, on va chercher les pages correspondantes dans df_by_page et vérifier qu'il n'y a qu'une seule page debut et fin de document sinon on renvoie un mesage d'erreur 

1316 # Merci de faire une boucle sur df_by_document et pour chaque ligne de df_by_document, on va chercher les pages correspondantes dans df_by_page et vérifier qu'il n'y a qu'une seule page debut et fin de document sinon on renvoie un mesage d'erreur 

1317 list_all_page = [] 

1318 for index, row in df_by_document.iterrows(): 

1319 liste_des_pages = row["Liste des pages"] 

1320 from lib.lib_util import parse_list_page_as_begin_end_separated 

1321 try: 

1322 list_page_one_document = list(map(int, liste_des_pages.split(","))) 

1323 except Exception as e: 

1324 print("Trying to parse list_page_per_doc as begin and end separated : " + str(e)) 

1325 list_page_one_document = parse_list_page_as_begin_end_separated(liste_des_pages) 

1326 print("list_page_one_document : " + str(list_page_one_document)) 

1327 if len(list_page_one_document) == 0: 

1328 print("ERROR") 

1329 

1330 list_all_page.extend(list_page_one_document) 

1331 if False: 

1332 list_info_debut_fin = [] 

1333 for page_nb in list_page_one_document: 

1334 df_by_page_one_page = df_by_page[df_by_page["Numéro de La Page"] == page_nb] 

1335 Information_debut_fin = df_by_page_one_page["Information_debut_fin"].values 

1336 list_info_debut_fin.append(Information_debut_fin) 

1337 # Maintenant on compte les multiples 

1338 count_begin = list_info_debut_fin.count(lambda x : "debut" in x.lower()) 

1339 count_end = list_info_debut_fin.count(lambda x : "fin" in x.lower()) 

1340 if count_begin > 1 or count_end > 1: 

1341 print("ERROR OR WARNING multiple debut fin : " + str(list_info_debut_fin)) 

1342 

1343 # On va vérifier que toutes les pages de df_by_page sont bien dans df_by_document sinon on les rajoute séparément comme un document à chaque fois 

1344 if list_all_page != list(set(list_all_page)): 

1345 print("ERROR OR WARNING Multiple page : " + str(list_all_page)) 

1346 

1347 if set(range(1, nb_page)) != set(list_all_page): # nb_page 

1348 print("ALL page : " + str(list_all_page)) 

1349 list_missing_page = list(set(range(1, nb_page)) - set(list_all_page)) 

1350 for missing_page in list_missing_page: 

1351 import pandas as pd 

1352 df_by_document = pd.concat([df_by_document, pd.DataFrame({"Liste des pages" : str(missing_page), "document_type" : "Added for completion"}, index=[len(df_by_document)])], ignore_index=True) 

1353 

1354 if trigger == "nb_pages_80": 

1355 if len(list_missing_page) > 0.2 * float(nb_page): 

1356 output["retry"] = True 

1357 output["retry_step_id"] = retry_step_id 

1358 output["max_nb_try"] = max_nb_try 

1359 print("TRIGGER RETRY IN LOAD TAB") 

1360 if to_be_used == "split_at_10": 

1361 print("Ici on pourrait reconstuirer les texts de input selon un split pris en entrée ou bien un split par défaut par exemple toutes les 10 ou 15 pages") 

1362 print("OUI ! Ici on pourrait reconstuirer les texts de input selon un split pris en entrée ou bien un split par défaut par exemple toutes les 10 ou 15 pages") 

1363 

1364 except Exception as e: 

1365 if trigger == "nb_pages_80": 

1366 output["retry"] = True 

1367 output["retry_step_id"] = retry_step_id 

1368 output["max_nb_try"] = max_nb_try 

1369 print("ERROR IN TRIGGER RETRY IN LOAD TAB") 

1370 print(str(e)) 

1371 

1372 # On va vérifier que l'ordre des pages d'un sous-document n'est pas modifié sinon on renvoie un message 

1373 

1374# if trigger == "nb_pages_80": 

1375 

1376# print("ERROR OR WARNING : nb_pages_80 : " + str(len(df))) 

1377 

1378 if saxia_all_doc_separated: 

1379 print("TO TEST !") 

1380 if "result" in input: 

1381 print("WARNING saxia_all_doc_separated Not expected ! ") 

1382 else : 

1383 input["result"] = "" 

1384 if "text_only_for_meta_data_and_not_split" in input: 

1385 input["text"] = input["text_only_for_meta_data_and_not_split"] 

1386 else: 

1387 print("All split csv should be tested more carefully, missing key text_only_for_meta_data_and_not_split") 

1388 del input["text_only_for_meta_data_and_not_split"] 

1389 

1390 if type(output[output_df]) != types.NoneType: 

1391 try: 

1392 output[output_df]["prediag"] = "MISSING" 

1393 if "prediag_csv" in input and input["prediag_csv"] != "" and input["prediag_csv"] != None: 

1394 print("We can add prediag in df but we need list_of_list_of_file") 

1395 # ca ca ne va pas : list_page_per_doc 

1396# list_page_per_doc = input["list_page_per_doc"] if "list_page_per_doc" in input else ";".join(list(map(str, list(range(1, nb_page + 1))))) 

1397# list_of_list_of_page_per_doc = list(map(lambda x: list(map(int, x.split(","))), list_page_per_doc.split(";"))) 

1398 

1399 prediag_csv = input["prediag_csv"] 

1400 list_prediag = prediag_csv.split(",") 

1401 if len(list_prediag) == nb_page: 

1402 # iterer sur le df et pour chaque ligne, on va chercher la page correspondante dans list_page_per_doc 

1403 for index, row in output[output_df].iterrows(): 

1404 liste_des_pages = row["Liste des pages"] 

1405 list_page_one_document = [] 

1406 if liste_des_pages != '': 

1407 list_page_one_document = list(map(int, liste_des_pages.split(","))) 

1408 # On va chercher la page correspondante dans list_page_per_doc 

1409 sub_prediag_csv = "" 

1410 for page_nb in list_page_one_document: 

1411 if page_nb - 1 < len(list_prediag): 

1412 if sub_prediag_csv != "": 

1413 sub_prediag_csv += "," 

1414 sub_prediag_csv += list_prediag[page_nb - 1] 

1415 else: 

1416 print("PROBLEMA CHECK !") 

1417 output[output_df].loc[index, "prediag"] = sub_prediag_csv 

1418 else: 

1419 print("PROBLEMB CHECK : " + str(nb_page) + " len(list_prediag) : " + str(len(list_prediag))) 

1420 else : 

1421 print("MISSING PREDIAG PROBLEMC CHECK !, TO DO PLEASE") 

1422 except Exception as e: 

1423 print(str(e)) 

1424 else: 

1425 print("ERROR CHECK : df is None") 

1426 

1427 print("END OF LOAD TAB") 

1428 return output 

1429 

1430 

1431 

1432# Keep for easy implementation of new function (remove pragma no cover and complete all) 

1433# INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('anon', 'datou_safia_step_anon', '["openai_token"]', '["list_page_content"]', '["result"]'); 

def datou_safia_step_anon(input : dict = None,
                          param_json : dict = None,
                          ce : CE = None,
                          verbose : bool = False,
                          layer_api : LayerGeneric = None) -> dict :
    """Anonymise a document and return the path of the anonymised PDF.

    The heavy lifting is delegated to ``lib.anon.lib_anon.anon_document``;
    this step only gathers its arguments from ``input`` / ``param_json``
    and repackages the result.

    Parameters
    ----------
    input : dict, optional
        Step input. Keys read: ``result``, ``list_page_content``,
        ``hash_id_treatment``, ``out_folder``, ``prefix_file``.
    param_json : dict, optional
        Step configuration: ``keyword``, ``exclude_word_split``,
        ``word_to_keep``, ``exclude_pers``, ``exclude_bib_start``,
        ``append_detected_to_output``, ``anon_all_unread``,
        ``remove_search_string_for_key_private_data`` and
        ``data.par`` (per-page paragraph dicts).
    ce : CE, optional
        Cost-estimation helper; unused in this step, kept for the common
        step-function signature.
    verbose : bool
        When True, print the anonymised PDF path.
    layer_api : LayerGeneric, optional
        Unused in this step, kept for the common step-function signature.

    Returns
    -------
    dict
        ``{"pdf_anon": <path>}``, optionally extended with the detected
        private-data info when ``append_detected_to_output`` is set
        (list values are flattened to ``;``-separated strings).
    """
    # `None` sentinels replace the former mutable `{}` defaults so a shared
    # dict can never leak state between calls.
    input = {} if input is None else input
    param_json = {} if param_json is None else param_json

    # Declared step interface (kept for consistency with the other step
    # functions in this module; not used at runtime here).
    list_input = []
    list_output = []
    list_param_json = []

    # Configuration, with the same defaults as before.
    keyword = param_json.get("keyword", {})
    exclude_word_split = param_json.get("exclude_word_split", [])
    word_to_keep = param_json.get("word_to_keep", [])
    exclude_pers = param_json.get("exclude_pers", ["docteur"])
    exclude_bib_start = param_json.get("exclude_bib_start", ["docteur ", "dr. ", "dr "])
    append_detected_to_output = bool(param_json.get("append_detected_to_output", False))
    anon_all_unread = bool(param_json.get("anon_all_unread", False))
    # List of keys whose info must be removed to achieve anonymisation.
    remove_search_string_for_key_private_data = param_json.get("remove_search_string_for_key_private_data", [])

    result_info_to_anon = input.get("result", "")
    list_page_content = input.get("list_page_content", [])

    # Paragraphs whose class is not "content" get bounding-box aliases
    # (xmin/ymin/xmax/ymax) so the anonymiser can mask them, and their text
    # is replaced by the "KKK" placeholder (the original text is preserved
    # in "old_text").  NOTE: this mutates the caller's paragraph dicts in
    # place — `p` is not copied.
    all_paragraphs = param_json.get("data", {}).get("par", [])
    paragraphs_to_anon = []
    for one_page in all_paragraphs:
        one_page_to_anon = []
        for p in one_page:
            if "class" in p and p["class"] != "content":
                p["xmin"] = p["x"]
                p["ymin"] = p["y"]
                p["xmax"] = p["x"] + p["w"]
                p["ymax"] = p["y"] + p["h"]
                p["old_text"] = p["text"]
                p["text"] = "KKK"
                one_page_to_anon.append(p)
        paragraphs_to_anon.append(one_page_to_anon)

    hash_id_treatment = input.get("hash_id_treatment", str(uuid4()))
    out_folder = input.get("out_folder")  # e.g. "static/temp/anon"; None lets the callee decide

    from lib.anon.lib_anon import anon_document
    from lib.lib_util import create_prefix_file_name_from_json_prefix

    prefix_file = create_prefix_file_name_from_json_prefix(input["prefix_file"]) if "prefix_file" in input else ""

    # Shallow copy: the outer list is new but the page lists and paragraph
    # dicts are shared with `paragraphs_to_anon`.
    paragraphs_to_anon_copy = paragraphs_to_anon.copy()
    list_pngs, pdf_anon, json_info_to_anon = anon_document(result_info_to_anon, list_page_content, verbose=False,
                                                          keyword = keyword, hash_id_treatment=hash_id_treatment,
                                                          prefix_file = prefix_file,
                                                          word_to_keep = word_to_keep,
                                                          exclude_word_split=exclude_word_split,
                                                          anon_all_unread=anon_all_unread,
                                                          remove_search_string_for_key_private_data=remove_search_string_for_key_private_data,
                                                          exclude_pers = exclude_pers,
                                                          exclude_bib_start = exclude_bib_start,
                                                          out_folder=out_folder,
                                                          paragraphs_to_anon=paragraphs_to_anon_copy)

    if verbose:
        print(" pdf_anon : " + str(pdf_anon))

    output = {"pdf_anon" : pdf_anon}

    if append_detected_to_output:
        for key in json_info_to_anon:
            # Flatten list values so every exported value is a scalar string.
            if isinstance(json_info_to_anon[key], list):
                json_info_to_anon[key] = ";".join(json_info_to_anon[key])
        output.update(json_info_to_anon)

    return output

1504 

1505 

1506 

1507 

1508# INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('format', 'datou_safia_step_format', '["format_info"]', '["result"]', '["result"]'); 

1509def datou_safia_step_format(input : dict = {}, 

1510 param_json : dict = {}, 

1511 ce : CE = None, 

1512 verbose : bool = False, 

1513 layer_api : LayerGeneric = None) -> dict : 

1514 list_input = [] 

1515 list_output = [] 

1516 list_param_json = [] 

1517 config_project = param_json["config_project"] if "config_project" in param_json else {} 

1518 format = config_project["saxia"]["format"] if "saxia" in config_project and "format" in config_project["saxia"] else {} 

1519 format_json_from_conf = format["info_format_intro"] if "info_format_intro" in format else {} 

1520 default_format_intro_hc = "Le {datet}, {document_type} par le Docteur {medecin_nom}, {medecin_specialite} :" 

1521 format_info = format_json_from_conf["format"] if "format" in format_json_from_conf else {} 

1522 info_format_intro = format_json_from_conf["format"]["intro"] if "format" in format_json_from_conf and "intro" in format_json_from_conf["format"] else {"default": default_format_intro_hc} 

1523 

1524 exec_if_true = param_json["exec_if_true"] if "exec_if_true" in param_json else True 

1525 if not exec_if_true or exec_if_true == {}: 

1526 print(" dont_exec_if_false is True, we skip the formatting step ") 

1527 return input 

1528 print("PAssed exec if true formatting step ") 

1529 

1530 append_resume = param_json["append_resume"] if "append_resume" in param_json else False 

1531 content_resume = input["content_resume"] if "content_resume" in input else "" 

1532 

1533 try : 

1534 print(" keys input : " + str(input.keys())) 

1535 length_input = {k : len(input[k]) if (type(input[k]) != bool and str(type(input[k])) != "<class 'NoneType'>") else 0 for k in input} 

1536 print(" length_input : " + str(length_input)) 

1537 except Exception as e: 

1538 print("ERROR Problem with input : " + str(e) + " treated as WARNING ") 

1539 

1540# "compte_rendu_complet_medecin": "from_json_copy" 

1541 compte_rendu_complet_medecin = param_json["compte_rendu_complet_medecin"] if "compte_rendu_complet_medecin" in param_json else "" 

1542 

1543 list_class_copy = param_json["list_class_copy"] if "list_class_copy" in param_json else [] 

1544 append_table_doc = (param_json["append_table_doc"] == 1 or param_json["append_table_doc"].lower() == "true") if "append_table_doc" in param_json else False 

1545 append_table_page = (param_json["append_table_page"] == 1 or param_json["append_table_page"].lower() == "true") if "append_table_page" in param_json else False 

1546 with_hyperlink = (param_json["with_hyperlink"] == 1 or str(param_json["with_hyperlink"]).lower() == "true") if "with_hyperlink" in param_json else False 

1547 append_parsing_meta_info_to_table = (param_json["append_parsing_meta_info_to_table"] == 1 or str(param_json["append_parsing_meta_info_to_table"]).lower() == "true") if "append_parsing_meta_info_to_table" in param_json else False 

1548 reproduce_format_new_page = (param_json["reproduce_format_new_page"] == 1 or str(param_json["reproduce_format_new_page"]).lower() == "true") if "reproduce_format_new_page" in param_json else False 

1549 reorder_paragraph_by_order_lex_token = (param_json["reorder_paragraph_by_order_lex_token"] == 1 or str(param_json["reorder_paragraph_by_order_lex_token"]).lower() == "true") if "reorder_paragraph_by_order_lex_token" in param_json else False 

1550 smart_new_line_from_token_pos = (param_json["smart_new_line_from_token_pos"] == 1 or str(param_json["smart_new_line_from_token_pos"]).lower() == "true") if "smart_new_line_from_token_pos" in param_json else False 

1551 order_by_date = (param_json["order_by_date"] == 1 or str(param_json["order_by_date"]).lower() == "true") if "order_by_date" in param_json else False 

1552 order_by_document_type = (param_json["order_by_document_type"] == 1 or str(param_json["order_by_document_type"]).lower() == "true") if "order_by_document_type" in param_json else False 

1553 result_input = input["result"] if "result" in input else "" 

1554 df = input["df"] if "df" in input else "None" 

1555 df_by_page = input["df_by_page"] if "df_by_page" in input else "None" 

1556 input_col_intro = input["input_col_intro"] if "input_col_intro" in input else "intro_correct_typo" 

1557 input_col_cr = input["input_col_cr"] if "input_col_cr" in input else "cr_correct_typo" 

1558 load_df_from_db_and_correct = (str(input["load_df_from_db_and_correct"]) == "1" or str(input["load_df_from_db_and_correct"]).lower() == "true") if "load_df_from_db_and_correct" in input else False 

1559 out_file = input["out_file"] if "out_file" in input else "" 

1560 nb_blank_line = param_json["nb_blank_line"] if "nb_blank_line" in param_json else 0 

1561 

1562 hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4()) 

1563 

1564 from lib.lib_util import parse_json_from_prompt_result, format_one_res, complete_date_and_order_json_to_mettre_en_forme, append_id_by_order 

1565 list_json_to_mettre_en_forme = parse_json_from_prompt_result(result_input) 

1566 list_json_to_mettre_en_forme = append_id_by_order(list_json_to_mettre_en_forme) 

1567 

1568 import pandas as pd 

1569 nb_doc = len(df) if type(df) == pd.DataFrame else 0 

1570 nb_page_from_df = len(df_by_page) if type(df_by_page) == pd.DataFrame else 0 

1571 

1572# if order_by_date : 

1573# list_json_to_mettre_en_forme = complete_date_and_order_json_to_mettre_en_forme(list_json_to_mettre_en_forme) 

1574 

1575 from lib.lib_util import create_prefix_file_name_from_json_prefix 

1576 prefix_file = create_prefix_file_name_from_json_prefix(input["prefix_file"]) if "prefix_file" in input else "" 

1577 

1578 df_complet_as_markdown = "" 

1579 df_complet_as_json = '' 

1580 nb_modif_manual = -1 

1581 nb_modif_class_manual = -1 

1582 nb_manual_action_df = -1 

1583 nb_manual_action_df_for_col_audit = -1 

1584 total_text = "" 

1585 result_output = "" # TODO duplicate 

1586 if verbose : 

1587 print(" load_df_from_db_and_correct : " + str(load_df_from_db_and_correct)) 

1588 else : 

1589 print(" load_df_from_db_and_correct : " + str(load_df_from_db_and_correct)[:100]) 

1590 out_folder = input["out_folder"] if "out_folder" in input else "temp" 

1591 

1592 from server.safia import lpgss_singleton 

1593 project_id = input["project_id"] if "project_id" in input else param_json["project_id"] if "project_id" in param_json else 70 

1594 conf_project = lpgss_singleton.load_conf_project(project_id) 

1595 format_json_from_conf = conf_project["saxia"]["format"]["info_format_intro"] if "saxia" in conf_project and "format" in conf_project["saxia"] and "info_format_intro" in conf_project["saxia"]["format"] else {} 

1596 info_format_intro_bis = format_json_from_conf["format"]["intro"] if "format" in format_json_from_conf and "intro" in format_json_from_conf["format"] else {"default": default_format_intro_hc} 

1597 

1598 print(" info_format_intro and info_format_intro_bis should be the same !") 

1599 

1600 outfile_name_docx = "" 

1601 audit_info_count = {} 

1602 audit_info_write = {} 

1603 # consolidate 

1604 if load_df_from_db_and_correct: 

1605 from lib.lib_safia_system import LibSafiaSystem 

1606 user_id = 0 

1607 from server.safia import lib_right_singleton 

1608 lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton) 

1609 lss.user_id = user_id # CA c'est un hack 

1610 hash_id_treatment = input["hash_id_treatment_input"] if "hash_id_treatment_input" in input else "default_value_hash_id_treatment" 

1611 

1612 from lib.manaudit.lib_datou_audit import load_audit_info_and_apply_manual_correction, list_action_by_user, count_time_lab_by_user 

1613 try : 

1614 

1615 df_auto, df_cons, hash_id_treatment_rerun, results, audit_info_count, id_file, audit_json_file_content_as_json, all_results, info_date = load_audit_info_and_apply_manual_correction(hash_id_treatment_auto = hash_id_treatment, 

1616 hash_id_treatment_manual = hash_id_treatment, 

1617 lpgss = lss.lib_user_data_internal, 

1618 project_id = project_id) 

1619 

1620 map_user_id_list_page, map_user_id_time_modif, map_user_id_list_pages_for_split = list_action_by_user(all_results, df_auto) 

1621 map_interval_sum_by_user = count_time_lab_by_user(all_results) 

1622 nb_correction_split = sum(list(map(lambda x: len(map_user_id_list_pages_for_split[x]), map_user_id_list_pages_for_split))) 

1623 if nb_correction_split == 0: 

1624 split_auto_perfect = True 

1625 else: 

1626 split_auto_perfect = False 

1627 if len(map_user_id_list_page) == 2: 

1628 print("We expect a labeliser and a corrector ") 

1629 user_id_0 = list(map_user_id_list_page.keys())[0] 

1630 user_id_1 = list(map_user_id_list_page.keys())[1] 

1631 nb_correction_0 = len(map_user_id_list_page[user_id_0]) 

1632 nb_correction_1 = len(map_user_id_list_page[user_id_1]) 

1633 if nb_correction_0 < nb_correction_1: 

1634 user_id_labeliser = user_id_1 

1635 user_id_corrector = user_id_0 

1636 elif nb_correction_0 == nb_correction_1: 

1637 print("WARNING EQUAL NUMBER OF CORRECTION BETWEEN LABELISER AND CORRECTOR, WE TAKE THE FIRST AS LABELISER") 

1638 user_id_labeliser = user_id_0 

1639 user_id_corrector = user_id_1 

1640 else : 

1641 user_id_labeliser = user_id_0 

1642 user_id_corrector = user_id_1 

1643 

1644 nb_page_no_correction = len(map_user_id_list_page[user_id_labeliser]) - len(map_user_id_list_page[user_id_corrector]) 

1645 max_nb_page = max(map_user_id_list_page[user_id_labeliser]) 

1646 time_minute_labelizer = map_interval_sum_by_user[user_id_labeliser]["total_minutes"] if user_id_labeliser in map_interval_sum_by_user and "total_minutes" in map_interval_sum_by_user[user_id_labeliser] else 0.0 

1647 time_minute_corrector = map_interval_sum_by_user[user_id_corrector]["total_minutes"] if user_id_corrector in map_interval_sum_by_user and "total_minutes" in map_interval_sum_by_user[user_id_corrector] else 0.0 

1648 nb_interval_labelizer = len(map_interval_sum_by_user[user_id_labeliser]["intervals"]) if user_id_labeliser in map_interval_sum_by_user and "intervals" in map_interval_sum_by_user[user_id_labeliser] else 0 

1649 nb_interval_corrector = len(map_interval_sum_by_user[user_id_corrector]["intervals"]) if user_id_corrector in map_interval_sum_by_user and "intervals" in map_interval_sum_by_user[user_id_corrector] else 0 

1650 info_correction = " nb_page_no_correction : " + str(nb_page_no_correction) + " pourcentage perfect : " + str(int(100 * float(nb_page_no_correction / float(max_nb_page)))) + \ 

1651 " user_id_labeliser : " + str(user_id_labeliser) + " with " + str(len(map_user_id_list_page[user_id_labeliser])) + f" corrections in {time_minute_labelizer:.2f} minutes in {nb_interval_labelizer} intervals " + \ 

1652 " user_id_corrector : " + \ 

1653 str(user_id_corrector) + " with " + str(len(map_user_id_list_page[user_id_corrector])) + f" corrections in {time_minute_corrector:.2f} minutes in {nb_interval_corrector} intervals " + \ 

1654 ", nb_page_no_correction : " + str(nb_page_no_correction) 

1655 nb_page_perfect = nb_page_no_correction 

1656 else : 

1657 time_minute_labelizer = -1 

1658 time_minute_corrector = -1 

1659 nb_interval_labelizer = -1 

1660 nb_interval_corrector = -1 

1661 info_correction = str(len(map_user_id_list_page)) + " users found in correction, we cannot separate labeliser and corrector : " + str(map_user_id_list_page) 

1662 

1663 if time_minute_labelizer == 0.0 or time_minute_corrector == 0.0: 

1664 info_correction += " WARNING time_minute_labelizer : " + str(time_minute_labelizer) + " or time_minute_corrector : " + str(time_minute_corrector) + " is zero " 

1665 info_correction += " map_interval_sum_by_user : " + str(map_interval_sum_by_user) 

1666 

1667 info_correction += " split_auto_perfect : " + str(split_auto_perfect) + " nb_correction_split : " + str(nb_correction_split) 

1668 

1669 #if type(df_auto_as_json) != types.NoneType: 

1670 # df_auto = pd.DataFrame(df_auto_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"]) 

1671 nb_modif_manual = sum(list(map(lambda x: len(x["manual_input_info"]["list_actions"]), all_results))) 

1672 try: 

1673 # nb_modif_class_manual = sum(list(map(lambda x: np.count_nonzero([0 if a["type_action"] != "class_paragraph" else 1 for a in x["manual_input_info"]["list_actions"]]), all_results))) 

1674 nb_modif_class_manual = audit_info_count["nb_modif_class_manual"] if "nb_modif_class_manual" in audit_info_count else -2 

1675 nb_manual_action_df = audit_info_count["nb_manual_action_df"] if "nb_manual_action_df" in audit_info_count else -2 

1676 nb_manual_action_df_for_col_audit = audit_info_count["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_count else -2 

1677 print(" Faisons mieux au dessus ") 

1678 except Exception as e: 

1679 print("Error while counting class_paragraph : " + str(e)) 

1680 

1681 df = results["df"] 

1682 

1683# id_file = all_result["id_file"] if "id_file" in all_result else "" 

1684 

1685 nb_doc = len(df) 

1686 try: 

1687 nb_page = len(audit_json_file_content_as_json["io_exec"]["3"]["input"]["paragraphs"]) 

1688 except Exception as e: 

1689 print(str(e)) 

1690 nb_page = -1 

1691 

1692 print("TODO warning 14-5-24 ce code doit etre dedupliquer vu que c'est le meme dans les deux parties de la condition consolidate et l'autre") 

1693 if order_by_date: 

1694 from lib.lib_util import order_df_by_date 

1695 df = order_df_by_date(df) 

1696 

1697 if order_by_document_type: 

1698 from lib.lib_util import order_by_document_type 

1699 df = order_by_document_type(df) 

1700 

1701 # Argument to modularize : df, input_col_intro, input_col_cr, out_file, hash_id_treatment 

1702 # Output : total_text 

1703 # why not output nb_file, nb_page, nb_modif_manual, 

1704 from lib.lib_util import write_table_list_inner_document_0424_bis 

1705 out_file = id_file + "_h_" + out_file 

1706 total_text, outfile_name_docx, audit_info_write = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, hash_id_treatment, out_folder, format_info, verbose=verbose, 

1707 content_resume=content_resume, append_resume=append_resume) 

1708 

1709 except Exception as e: 

1710 print("Error while loading and correcting df : " + str(e)) 

1711 

1712 else: # format from initial run (not consolidate) 

1713 df_auto = None 

1714 print(" df len 1058 : " + str(len(df))) 

1715 document = None 

1716 out_file = "" 

1717 

1718 print(" df len 1062 : " + str(len(df))) 

1719 

1720 if compte_rendu_complet_medecin == "from_json_copy": 

1721 from lib.lib_util import split_list_page_by_doc 

1722 text = input["text"] if "text" in input else None 

1723 if text != None and ( 

1724 len(input["list_page_per_doc"]) == len(text) or (len(input["list_page_per_doc"]) == 1 and type(text) == str)): 

1725 print( 

1726 "We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25") 

1727 if type(text) == str: 

1728# list_texts = [text] 

1729 list_list_page_doc = [[]] # en fait on aura besoin des paragraphs, refacto necessaire ! 

1730 list_list_page_doc = [[]] 

1731 try: 

1732 list_list_page_doc = split_list_page_by_doc(input["paragraphs"], input["list_page_per_doc"]) 

1733 except Exception as e: 

1734 print("l 1051 If only one doc it could run !, and maybe also with multiple doc by the way") 

1735 print(str(e)) 

1736 

1737 else : 

1738 list_list_page_doc = [None] * len(list_json_to_mettre_en_forme) 

1739 

1740 if len(list_list_page_doc) != len(list_json_to_mettre_en_forme): 

1741 print("ERROR TREATED AS WARNING BUT MAKES HUGE ERROR OR MISSING DATA Problem with list_list_page_doc and list_json_to_mettre_en_forme : " + str(len(list_list_page_doc)) + " != " + str(len(list_json_to_mettre_en_forme))) 

1742 

1743 if append_parsing_meta_info_to_table: 

1744 from lib.lib_util import add_parsing_meta_info_to_table 

1745 df = add_parsing_meta_info_to_table(df, list_json_to_mettre_en_forme) 

1746 else: 

1747 print(" ERROR if we do not add 'id' to df, we will not be able to merge with manual correction") 

1748 print(" Furthermore if I want to refacto the loop for exporting automatic datou results, I will have to add 'id' to df and also use df instead of list of json") 

1749 

1750 print(" df len 1077 : " + str(len(df))) 

1751 

1752 format_out_file = "docx" 

1753 out_file = prefix_file + "_" + hash_id_treatment + "." + format_out_file 

1754 

1755 print(" df len 1091 : " + str(len(df))) 

1756 

1757 # not used anymore, set as option or remove 

1758 if False: 

1759 from lib.lib_util import write_table_list_inner_document 

1760 import pandas as pd 

1761 if append_table_doc and type(df) == pd.DataFrame: 

1762 document = write_table_list_inner_document(df, document, with_hyperlink) 

1763 if append_table_page and type(df_by_page) == pd.DataFrame: 

1764 document = write_table_list_inner_document(df_by_page, document, with_hyperlink=False) 

1765 

1766 index_for_hyperlink = 0 

1767 for index, row in df.iterrows(): 

1768 one_json = row.to_dict() 

1769 list_paragraph_doc = [] 

1770 if index < len(list_list_page_doc): 

1771 list_page_doc = list_list_page_doc[index] 

1772 else : 

1773 print(" ERROR missing list page doc and we have not verified the rest !") 

1774 continue 

1775 # et non mais on pourrait faire des verifications TODO 26-4-24 one_json["Liste des pages"] if "Liste des pages" in one_json else "" 

1776# for one_json, list_page_doc in zip(list_json_to_mettre_en_forme, list_list_page_doc): 

1777 try : 

1778 if "document_type" not in one_json: 

1779 document_type = "default" 

1780 print(" Missing document_type in one result " + str(one_json)) 

1781 else: 

1782 if type(one_json["document_type"]) == list: 

1783 document_type = one_json["document_type"][0] 

1784 print(" document_type is a list : " + str(one_json["document_type"]) + " treated as " + str(document_type)) 

1785 else: 

1786 document_type = one_json["document_type"] 

1787 

1788 if compte_rendu_complet_medecin == "from_json_copy": 

1789 print(" How to be sure it is the same doc as in the list_page_doc ?? Et oui je crois que grace à l'id ou le fait qu'on a fusionné !") 

1790 from lib.lib_util import concat_content_from_list_page_doc 

1791 if reorder_paragraph_by_order_lex_token: 

1792 print("WILL FAIL WE NEED TO CHECK IF WE HAVE token in list_page_content ") 

1793 print(" list_page_content : " + str(list_page_content)) 

1794 from_json_content_copy = concat_content_from_list_page_doc(list_page_doc, 

1795 reproduce_format_new_page=reproduce_format_new_page, 

1796 height_line=0, 

1797 reorder_paragraph_by_order_lex_token = reorder_paragraph_by_order_lex_token, 

1798 smart_new_line_from_token_pos = smart_new_line_from_token_pos, 

1799 list_class_copy = list_class_copy) 

1800 one_json["compte_rendu_complet_medecin"] = from_json_content_copy 

1801 print(" from_json_content_copy : " + str(from_json_content_copy)[:100]) 

1802 df.loc[index_for_hyperlink, "cr_back"] = "empty" 

1803 

1804# df.loc[index_for_hyperlink, "cr_back"] = from_json_content_copy.replace("\n\n", "\n") 

1805 df.loc[index_for_hyperlink, ['cr_back']] = [from_json_content_copy.replace("\n\n", "\n")] 

1806 else: 

1807 if type(one_json["compte_rendu_complet_medecin"]) == list: 

1808 print(" ERROR OR WARNING How to handle compte_rendu_complet_medecin as list : " + str(one_json["compte_rendu_complet_medecin"])) 

1809 df.loc[index_for_hyperlink, "cr_back"] = str(one_json["compte_rendu_complet_medecin"]) 

1810 

1811 # TODO remove VR 25-4-24 : certainement useless 

1812 if out_file == "": 

1813 out_file = prefix_file + "_" + hash_id_treatment + "." + format_out_file 

1814 

1815 new_format_info = info_format_intro[document_type] if document_type in info_format_intro else info_format_intro["default"] 

1816 print(" new_format_info : " + str(new_format_info) + " document_type : " + str(document_type) + " info_format_intro.keys : " + str(info_format_intro.keys())) 

1817 

1818 format_premier = format_info["format_premier"] if "format_premier" in format_info else "default" 

1819 format_date = format_info["format_date"] if "format_date" in format_info else "default" 

1820 list_variable_underline = format_info["list_variable_underline"] if "list_variable_underline" in format_info else [] 

1821 list_variable_bold = format_info["list_variable_bold"] if "list_variable_bold" in format_info else [] 

1822 

1823 new_intro = format_one_res(one_json, new_format_info, format_premier, format_date, verbose=verbose, 

1824 list_variable_bold=list_variable_bold, list_variable_underline=list_variable_underline) 

1825 df.loc[index_for_hyperlink, "intro_back"] = new_intro 

1826 # TODO sans doute à garder pratique pour debugguer mais en fait calculer par write ... 

1827 result_output += new_intro 

1828 index_for_hyperlink += 1 # TODO renommer 

1829 

1830 except Exception as e: 

1831 print("Error while parsing one result : " + str(e)) 

1832 

1833 print(" df len 1147 : " + str(len(df))) 

1834 

1835# input_col_intro = "intro_back" 

1836# input_col_cr = "cr_back" 

1837# from lib.lib_util import write_table_list_inner_document_0424_bis 

1838# total_text = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, 

1839# hash_id_treatment, out_folder) 

1840 # A refacto 

1841 # with_hyperlink 

1842 # reproduce_format_new_page 

1843 

1844 

1845 try: 

1846 if order_by_date: 

1847 from lib.lib_util import order_df_by_date 

1848 df = order_df_by_date(df) 

1849 except Exception as e: 

1850 print("CHECK !") 

1851 print(str(e)) 

1852 

1853 if order_by_document_type: 

1854 from lib.lib_util import order_by_document_type 

1855 df = order_by_document_type(df) 

1856 

1857 if nb_blank_line > 0: 

1858 from lib.lib_util import add_blank_line 

1859 try: 

1860 df = add_blank_line(df, nb_blank_line) 

1861 except Exception as e: 

1862 print(str(e)) 

1863 

1864 

1865 import pandas as pd 

1866 df_complet_as_markdown = df.to_markdown() if type(df) == pd.DataFrame else "" 

1867 df_complet_as_json = df.to_json() if type(df) == pd.DataFrame else "" 

1868 df_auto = None 

1869 

1870# print(" df_complet_as_markdown len 1152 : " + str(len(df_complet_as_markdown))) 

1871 print(" out_folder : " + out_folder) 

1872 

1873 input_col_intro = "intro_back" 

1874 input_col_cr = "cr_back" 

1875 from lib.lib_util import write_table_list_inner_document_0424_bis 

1876 try: 

1877 total_text, outfile_name_docx, audit_info_write = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, 

1878 hash_id_treatment, out_folder, format_info) 

1879 except Exception as e: 

1880 print(str(e)) 

1881 print("Error while computing plop write_table_list_inner_document_0424_bis format") 

1882 

1883 output = {"result" : result_output, "out_file" : out_file, 

1884 "df_complet_as_markdown" : df_complet_as_markdown, 

1885 "df_complet_as_json" : df_complet_as_json, 

1886 "nb_doc" : nb_doc, 

1887 "nb_page_from_df" : nb_page_from_df, 

1888 "nb_word_result" : total_text.count(" ") + 1, 

1889 "nb_modif_manual" : nb_modif_manual, 

1890 "nb_doc_modif_correct_test_2812" : "test_integration_prime_productivite", 

1891 "prime_productivite_test_2812" : "test_integration_prime_productivite", 

1892 "info_correction" : info_correction if 'info_correction' in locals() else "", 

1893 "nb_page_perfect" : nb_page_perfect if 'nb_page_perfect' in locals() else -1, 

1894 "time_minute_labelizer" : time_minute_labelizer if 'time_minute_labelizer' in locals() else -1.0, 

1895 "user_id_labeliser" : user_id_labeliser if 'user_id_labeliser' in locals() else -1, 

1896 "split_auto_perfect" : split_auto_perfect if 'split_auto_perfect' in locals() else None, 

1897 "nb_modif_class_manual" : nb_modif_class_manual, 

1898 "prefix_file" : prefix_file, 

1899 "nb_manual_action_df_for_col_audit" : nb_manual_action_df_for_col_audit, 

1900 "nb_manual_action_df" : nb_manual_action_df, 

1901 "out_folder" : out_folder, 

1902 "df" : df, 

1903 "outfile_name_docx" : outfile_name_docx, 

1904 "content_commemo" : total_text} 

1905 

1906 print(str(audit_info_count.keys())) 

1907 

1908 # Temporaire ! 

1909 path_csv_complete_with_prediag = "/Users/moilerat/Documents/Fotonower/Safia/prompt/misc/csv_prediag_all_almost/all_csv_prediag.csv" 

1910# path_csv_complete_with_prediag = "/home/safia/workarea/git/Safia/prompt/python/misc/csv_prediag_all_almost/all_csv_prediag.csv" 

1911 if os.path.exists(path_csv_complete_with_prediag): 

1912 try: 

1913 import pandas as pd 

1914 df_prediag, _ = pd.read_csv(path_csv_complete_with_prediag, sep=";") 

1915 

1916 from lib.sandbox.migration import migrate_df_complete_with_prediag 

1917 df = migrate_df_complete_with_prediag(df_prediag, df, id_file) 

1918 

1919 except Exception as e: 

1920 print("Error while loading csv complete with prediag : " + str(e)) 

1921 

1922 #if limit == 0: 

1923 print(" if limit == 0 (mais d'ou le trouve t'on grrr) On pourrait aussi insérer le prediag dans le df et le renvoyer dans le json") 

1924 

1925 output["map_count_modif_per_doc"] = audit_info_count["map_count_modif_per_doc"] if "map_count_modif_per_doc" in audit_info_count else {} 

1926 output["map_modif_type_document"] = audit_info_count["map_modif_type_document"] if "map_modif_type_document" in audit_info_count else {} 

1927 output["audit_info_write"] = audit_info_write 

1928 try : 

1929 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 'Nombre de pages', 'Titre', 'medecin_specialite', 'nom_hopital', 'genre_service_hopital', 

1930 'indication_examen', 'date_entree_hospitalisation', 

1931 'date_sortie_hospitalisation', 'motif_hospitalisation', 

1932 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced', 

1933 'date_fin_arret_travailt', 'date_entree_hospitalisationt', 'date_sortie_hospitalisationt']: 

1934 col_auto = col_ref + "_auto" 

1935 df[col_auto] = "None" 

1936 

1937 if col_ref not in df.columns: 

1938 print("Missing col ref : " + str(col_ref) + " in df") 

1939 continue 

1940 

1941 if type(df_auto) == pd.DataFrame: 

1942 df_auto['Nombre de pages'] = "None" 

1943 for index, row in df_auto.iterrows(): 

1944 if 'Liste des pages' in df_auto.columns: 

1945 df_auto.loc[index, 'Nombre de pages'] = str(len(row['Liste des pages'].split(','))) if type(row['Liste des pages']) == str else "None" 

1946 

1947 if index < len(df): 

1948 found_list = df.index[df['Liste des pages'] == row['Liste des pages']].tolist() 

1949 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 

1950 'Nombre de pages', 'Titre', 'medecin_specialite', 'nom_hopital', 

1951 'genre_service_hopital', 

1952 'indication_examen', 'date_entree_hospitalisation', 

1953 'date_sortie_hospitalisation', 'motif_hospitalisation', 

1954 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced', 

1955 'date_fin_arret_travailt', 'date_entree_hospitalisationt', 

1956 'date_sortie_hospitalisationt']: 

1957 col_auto = col_ref + "_auto" 

1958# df[col_auto] = "None" 

1959 

1960 if col_ref not in df.columns: 

1961 print("Missing col ref : " + str(col_ref) + " in df") 

1962 continue 

1963 

1964 if len(found_list) == 0: 

1965 print("Missing idx for liste pages " + str(row['Liste des pages'])) 

1966 if len(found_list) > 1: 

1967 print("Warning : more than one idx for liste pages " + str(row['Liste des pages']) + " found : " + str(found_list)) 

1968 

1969 for idxf in found_list: 

1970 df.loc[idxf, col_auto] = row[col_ref] 

1971# df.loc[index, col_auto] = row[col_ref] 

1972 else : 

1973 print("Inconsistent dataframe auto and df : " + str(index) + " > " + str(len(df))) 

1974 

1975 print(" df_auto : " + str(df_auto)) 

1976 output["df_cons"] = df.to_json() 

1977 # Test write and load df.to_dict() 

1978 output["df_cons"] = df.to_dict() 

1979 output["df_auto"] = df_auto.to_json() 

1980 except Exception as e: 

1981 print(str(e)) 

1982 if "hash_id_treatment_input" in input: 

1983 output["hash_id_treatment_input"] = input["hash_id_treatment_input"] 

1984 

1985 return output 

1986 

def datou_safia_step_classify_doc(input : dict = None,
                                  param_json : dict = None,
                                  ce : "CE" = None,
                                  verbose : bool = False,
                                  layer_api : "LayerGeneric" = None) -> dict :
    """Dispatch one sub-task of the document-classification pipeline.

    The sub-task is selected by ``task`` (read from ``input`` first, then
    ``param_json``).  Known tasks: ``re_classifier``, ``prepare_prompt``,
    ``parse_result``, ``merge_result``, ``classify_doc``,
    ``detect_name_camembert``, ``parse_result_camembert``,
    ``prepare_anon_from_camembert``, ``create_entity_bib_from_camembert``.

    Parameters
    ----------
    input : dict
        Runtime payload; depending on the task it may contain "paragraphs",
        "task", "result", "text", "pers", "list_page_content".
    param_json : dict
        Static configuration; may contain "rules_classifier", "task",
        "input_format", "taxonomy_text" and the camembert model parameters.
    ce : CE
        Cost-estimation helper (unused here, kept for the step interface).
    verbose : bool
        Forwarded to the underlying classifiers.
    layer_api : LayerGeneric
        LLM layer (unused here, kept for the step interface).

    Returns
    -------
    dict
        Task-dependent result; ``{}`` for tasks that only mutate their input
        in place (``parse_result`` writes back into ``input["paragraphs"]``)
        or that are not implemented yet.
    """
    list_input = []
    list_output = []
    list_param_json = []

    # BUG FIX: mutable default arguments ({}) are shared across calls;
    # normalize None to a fresh dict instead.
    input = {} if input is None else input
    param_json = {} if param_json is None else param_json

    paragraphs = input.get("paragraphs", [])
    rules_classifier = param_json.get("rules_classifier", [])
    task = input["task"] if "task" in input else param_json.get("task", "")

    list_detects = rules_classifier["detect"] if "detect" in rules_classifier else "detect"
    map_classifier = rules_classifier["classify"] if "classify" in rules_classifier else "classify"
    input_format = param_json.get("input_format", "markdown")

    # task : re_classifier,prepare_prompt,parse_result,merge_result,classify_doc

    # Default taxonomy used to build the classification prompt.
    # BUG FIX: the former default was wrapped in an extra {"taxonomy_text": ...}
    # level, so building map_taxonomy_per_key below always raised and the map
    # silently ended up empty.
    taxonomy_text = param_json.get("taxonomy_text", {
        "header":
            {"key": "H",
             "description": "En-tete et pied de page : adresse(s) de la structure, nom du/des medecins, numéro du secretariat, raison social, Logo, numéro des pages, les titres et diplomes, condition de paiement"
             },
        "info_medecin":
            {"key": "M",
             "description": "- Spécialité et Nom du médecin (des fois en en-tete ou signature (eventuellement tampon) )\n- Date du jour (ou de l'edition)\n- Données personnels du patient : ...\n- Contenu pertinent : \n - A faire selon les classes de document\n - Par défaut\n- Ecriture manuscrite\n- Document administratif (CNI)\n- Autres : Règle administrative, preuve du respect du secret médical\nCertificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit."},
        "content":
            {"key": "C", "description": "Contenu pertinent : description specifique du cas du patient"},
        "manuscrit":
            {"key": "A", "description": "Mots tronqués ou mal orthographier, alphabet étrangère"},
        "admin": {"key": "D", "description": "Document administratif (CNI) ou autre document administratif"},
        "autre": {"key": "O", "description": "Autres : Règle administrative, preuve du respect du secret médical, par exemple le texte : Certificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit."}
    })

    # Reverse lookup: short taxonomy key ("H", "M", ...) -> taxonomy name.
    try :
        map_taxonomy_per_key = {taxonomy_text[k]["key"] : k for k in taxonomy_text}
    except Exception as e:
        print(str(e))
        map_taxonomy_per_key = {}

    # BUG FIX: several branches (re_classifier, merge_result, classify_doc,
    # the *_camembert stubs) never assigned `output`, which made the final
    # `return output` raise NameError.  Default to an empty result.
    output = {}

    if task == "re_classifier":
        from lib.lib_ml.lib_text_classifier import classify_text
        # BUG FIX: `list_page_content` was read without ever being defined in
        # this branch; it is now taken from the input payload.
        list_page_content = input.get("list_page_content", [])
        for page in list_page_content:
            for paragraph in page.list_blocks["paragraphs"]:
                for detect in list_detects:
                    res = classify_text(paragraph["text"], list_detects[detect], verbose = verbose)
                    if len(res) > 0:
                        print("detect : " + str(detect) + " res : " + str(res))
                        if detect in map_classifier:
                            paragraph["class"] = map_classifier[detect]
                        else :
                            print("Missing class, by default we keep it as content_classn but later since this case is not used in production as of 4-3-24")
    elif task == "prepare_prompt": # prepare_prompt_classifier_bib
        from lib.prompt.lib_gen_prompt import generate_prompt_classifier
        preprompt = generate_prompt_classifier(taxonomy_text, verbose = verbose, output_type = input_format)
        # TODO add in input input_type = "json", output_type = "table",
        #      language = "fr",
        #      output_key = "id", output_class = "classe",
        #      separator
        text = str(paragraphs[0]) if len(paragraphs) > 0 else ""
        output = {"preprompt" : preprompt, "text" : text}
    elif task == "parse_result": # parse_result_prompt_classifier_bib
        result = input.get("result", "")
        from lib.batch.lib_batch import create_pandas_table_from_text
        from lib.lib_util import parse_json_from_prompt_result

        if input_format == "markdown":
            df = create_pandas_table_from_text(result, verbose = verbose)
        elif input_format == "json":
            df = parse_json_from_prompt_result(result, verbose = verbose)
        else :
            print("ERROR Unsupported input_format : " + str(input_format))
            # BUG FIX: `df` was left undefined here, which crashed the lookup
            # loop below with NameError.
            df = None
        print("TODO finish voila (add in document if only one document)")
        if len(paragraphs) != 1:
            print("only one document is managed since we use table as output for prompt !")
        elif df is not None:
            output_class = "classe"
            import sys  # hoisted out of the per-paragraph loop
            for paragraph in paragraphs[0]:
                sys.stdout.write("ç")
                if "id" not in paragraph:
                    print("Missing id in paragraph : " + str(paragraph))
                    continue
                para_id = paragraph["id"]  # renamed: `id` shadowed the builtin
                data_found = df[df["id"] == str(para_id)]
                key_classes = data_found[output_class] if output_class in data_found else []
                if len(key_classes) == 1:
                    # BUG FIX: `key_classes[id]` label-indexed the filtered
                    # Series by paragraph id, which only matches by accident;
                    # take the single matching row positionally instead.
                    key_class = key_classes.iloc[0]
                else:
                    print("Missing class")
                    key_class = "unknown"
                paragraph["class"] = map_taxonomy_per_key.get(key_class, "unknown")
        # Results are written back into input["paragraphs"]; nothing to return.

    elif task == "merge_result":
        pass
    elif task == "classify_doc":
        pass
    elif task == "detect_name_camembert": # context_entity_camembert
        from lib.lib_ml.lib_nlp.lib_pipeline_ner import detect_name_ner
        input_text = input.get("text", "")
        name_pretrained_model = param_json.get("name_pretrained_model", "Jean-Baptiste/camembert-ner")
        name_tokenizer = param_json.get("name_tokenizer", "Jean-Baptiste/camembert-ner")
        what_I_want = param_json.get("what_I_want", "PER")
        aggregation_strategy = param_json.get("aggregation_strategy", "simple")
        # NOTE(review): a disabled (`if False`) branch used to iterate
        # input["list_page_content"] here instead of the raw text; removed as
        # dead code.
        list_to_treat = [input_text]

        all_list_name = []
        for input_text_aux in list_to_treat:
            list_name = detect_name_ner(input_text_aux,
                                        name_pretrained_model=name_pretrained_model,
                                        name_tokenizer=name_tokenizer,
                                        what_I_want=what_I_want,
                                        aggregation_strategy=aggregation_strategy)
            all_list_name.extend(list_name)

        print(" list_name : " + str(all_list_name))
        # "result" mirrors the fenced-JSON format produced by the LLM prompt
        # steps so downstream parsing can stay uniform.  (A dead first
        # `output = {"pers": ...}` store was removed.)
        output = {"pers" : all_list_name,
                  "result" : "```json\n{\"PERS\":[" + ",".join(list(map(lambda x : "\"" + x + "\"", all_list_name))) + "]}\n```"}
    elif task == "parse_result_camembert": # parse_result_camembert
        list_name = input.get("pers", [])
        pass  # TODO: not implemented yet
    elif task == "prepare_anon_from_camembert": # parse_result_camembert
        list_name = input.get("pers", [])
        pass  # TODO: not implemented yet
    elif task == "create_entity_bib_from_camembert": # parse_result_camembert
        pass
    else :
        print("Unsupported task : " + str(task))
        output = {"result": "Unsupported task : " + str(task)}

    import sys
    sys.stdout.write("O")
    return output

2134 

2135 

def datou_safia_step_client(input : dict = None,
                            param_json : dict = None,
                            ce : "CE" = None,
                            verbose : bool = False,
                            layer_api : "LayerGeneric" = None) -> dict : # pragma no cover
    """Upload file(s) and a preprompt to a remote HTTP endpoint and collect results.

    For each file a multipart POST is sent to
    ``{protocol}://{host}:{port}/{end_point}``.  The response appears to be
    JSON of the shape ``{"res": [<llama-style transcript>, ...]}``; only the
    text after the first ``assistant<|end_header_id|>`` marker of the first
    transcript is kept as the per-file result — TODO confirm against the
    server implementation.

    Parameters
    ----------
    input : dict
        May contain "file", "files", "preprompt", "model".
    param_json : dict
        May contain "host", "protocol", "port", "end_point", "preprompt".
    ce : CE
        Cost-estimation helper (unused here, kept for the step interface).
    verbose : bool
        Unused here, kept for the step interface.
    layer_api : LayerGeneric
        Unused here, kept for the step interface.

    Returns
    -------
    dict
        {"map_result": {basename: extracted text},
         "result": comma-joined extracted texts,
         "full_result": {basename: raw parsed JSON response}}
    """
    list_input = ["file", "preprompt", "model", "files"]
    list_output = []
    list_param_json = ["host", "protocol", "port", "end_point", "preprompt"]

    import json
    import logging
    import os

    import requests  # third-party; function-local import as elsewhere in this module

    # BUG FIX: mutable default arguments ({}) are shared across calls.
    input = {} if input is None else input
    param_json = {} if param_json is None else param_json

    end_point = param_json.get("end_point", "api/v1/upload")
    host = param_json.get("host", "localhost")
    protocol = param_json.get("protocol", "http")
    port = param_json.get("port", 4998)

    file = input.get("file", "")
    files = input.get("files", [])
    preprompt = input["preprompt"] if "preprompt" in input else param_json.get("preprompt", "")
    model = input.get("model", "")

    logger = logging.getLogger()
    logger.info("In datou_safia_step_client l 2081 ") # + str(__line__))

    print("file : " + str(file))
    print("files : " + str(files))
    print("preprompt : " + str(preprompt))
    print("model : " + str(model))

    url = f"{protocol}://{host}:{port}/{end_point.lstrip('/')}"
    # BUG FIX: the old code appended to `files` in place, mutating the
    # caller's list when an empty "files" entry was provided; build a local
    # fallback list instead.
    if len(files) == 0:
        files = [file]

    map_res_file = {}
    map_full_res_file = {}
    for f in files:
        fbn = os.path.basename(f)
        logger.info("In datou_safia_step_client l 2102 preprompt " + str(preprompt[:100])) # + str(__line__))

        data = {'preprompt': preprompt} #, 'model': model}
        if model is not None and model != '':
            data['model_name'] = model
        try :
            # BUG FIX: the file handle was opened and never closed (resource
            # leak; open() was also outside the try, so a missing file crashed
            # the whole step).  A context manager fixes both.
            with open(f, 'rb') as fh:
                response = requests.post(url, files={'file': fh}, data=data)
            if response.status_code == 200:
                print("File uploaded successfully")
            else:
                print("File upload failed")

            logger.info("In datou_safia_step_client l 2112 response received ") # + str(__line__))
            logger.info("In datou_safia_step_client l 2112 response.status_code " + str(response.status_code)) # + str(__line__))

            print("TO USE TO CREATE NEW STEP")

            res_parsed_json = json.loads(response.content.decode('utf-8'))
        except Exception as e:
            print("Error while sending file to server in datou_Step_client : " + str(e))
            res_parsed_json = {}

        # Keep only the assistant part of the first transcript, e.g.
        #   assistant<|end_header_id|> ... TABLEAU<|eot_id|>
        some_useful_result = res_parsed_json.get("res", [])
        one_useful_result = some_useful_result[0] if len(some_useful_result) > 0 else ""
        result_as_array = one_useful_result.split("assistant<|end_header_id|>")
        if len(result_as_array) > 1:
            the_useful_result = result_as_array[1].replace("\n", "").rstrip("<|eot_id|>")
        else :
            the_useful_result = ""

        map_res_file[fbn] = the_useful_result
        map_full_res_file[fbn] = res_parsed_json

    logger.info("In datou_safia_step_client l 2138 wip ")
    # Idiom: str.join instead of manual += concatenation.
    csv_result = ",".join(map_res_file[f] for f in map_res_file)

    output = {"map_result" : map_res_file,
              "result" : csv_result,
              "full_result" : map_full_res_file}

    return output

2227 

2228 

2229 

# Kept for easy implementation of a new function (remove "pragma no cover" and complete all fields).
# Sometimes it will also be needed to do DEV DOC : INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('append_to_doc', 'datou_safia_step_append_to_doc_content', '["openai_token"]', '["result", "document_id", "project_id", "user_id"]', '["result", "references"]');

def datou_safia_step_TEMPLATE(input : dict = {},
                              param_json : dict = {},
                              ce : CE = None,
                              verbose : bool = False,
                              layer_api : LayerGeneric = None) -> dict : # pragma no cover
    """Skeleton for implementing a new datou step.

    Copy this function, rename it, fill in the declared input / output /
    param lists and the body, then drop the ``pragma no cover`` marker.
    Returns a recognisable placeholder payload.
    """
    # Declared step contract — intentionally empty in the template.
    list_input = []
    list_output = []
    list_param_json = []

    print("TO USE TO CREATE NEW STEP")

    return {"result" : "some_result (but TEMPLATE)"}

2246 

2247