Coverage for lib/datou/lib_datou_step_template.py: 62%
1310 statements
coverage.py v7.9.1, created at 2026-02-10 01:10 +0100
import datetime
import os.path
import shutil
import types

import numpy as np

from lib.brick_layers.lib_abstract_generic_layer import LayerGeneric, LayerPrompt

# TODO ARCHI VR 14-6-23 : should we make a class to expose the cost_estimation services?
from auth.lib_cost import CostEstimation as CE
from uuid import uuid4

# speech_to_text
def datou_safia_step_speech_to_text(input : dict, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["file"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["text"]
    list_param_json = ["openai_token"]  # TODO VR REFACTO : have an editing interface, or a context from which this info is retrieved (but that context will potentially depend on the project to specify the model type, or on an "instantiation" of a datou)

    file = input["file"]
    openai_token = param_json["openai_token"]

    parse_prefix_file = bool(param_json["parse_prefix_file"]) if "parse_prefix_file" in param_json else False

    # Hard-coded param
    language = None

    from lib.lib_speechtotext import speech_to_text
    text = ""
    length_time_seconds = 0
    model = ""  # default so ce.compute_cost_search below cannot hit an unbound name for unhandled extensions
    if file.endswith(".amr") or file.endswith(".ogg") or file.endswith(".mp4") or file.endswith(".webm"):
        print("import convert_file")
        from lib.lib_speechtotext import convert_file
        print("calling convert_file")
        new_file = convert_file(file)
        print("calling speech_to_text")
        text, length_time_seconds, model = speech_to_text(new_file, openai_token, language=language, verbose=verbose)
#        logger.info("Is length_time_seconds null ? " + str(length_time_seconds))

    # TODO VR REFACTO : I do not like these two duplicated calls
    elif file.endswith(".mp3") or file.endswith(".m4a") or file.endswith(".wav"):
        import os
        size = os.path.getsize(file)
        print(" size : " + str(size))

        if size > 10000000:  # pragma no cover scale
            print(" size : " + str(size))
            from lib.lib_speechtotext import convert_file
            print("calling convert_file")
            new_file = convert_file(file)
            size = os.path.getsize(new_file)
            print(" size : " + str(size))

            from lib.lib_speechtotext import split_mp3
            nb_split = 1 + int(size / 10000000)
            list_files = split_mp3(new_file, nb_split, verbose=verbose)

            text = ""
            length_time_seconds = 0
            model = ""
            for file_aux in list_files:
                text_aux, length_time_seconds_aux, model = speech_to_text(file_aux, openai_token, language=language, verbose=verbose)
                text += text_aux
                length_time_seconds += length_time_seconds_aux
        else:
            text, length_time_seconds, model = speech_to_text(file, openai_token, language=language, verbose=verbose)

    ce.compute_cost_search(model, length_time_seconds)

    # TODO VR REFACTO : the files must also be deleted

    output = {"text" : text}

    if "preprompt" in input:
        output["preprompt"] = input["preprompt"]

    return output
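
# --- Illustrative usage sketch, not part of the original module ---
# _StubCE stands in for auth.lib_cost.CostEstimation, whose constructor
# signature is not shown in this file; only the compute_cost_search call used
# by the steps is mimicked. The file name and token below are placeholders.
class _StubCE:
    def compute_cost_search(self, model, quantity):
        print("cost estimation stub:", model, quantity)

def _example_speech_to_text():
    # Assumes a local "note.mp3" and a valid OpenAI token.
    out = datou_safia_step_speech_to_text({"file": "note.mp3"},
                                          param_json={"openai_token": "sk-..."},
                                          ce=_StubCE(), verbose=True)
    print(out["text"][:200])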


def sub_func_read_ocr(f, count, model, verbose, map_file_size, map_file_text, folder_export_boxes,
                      begin_page=False, end_page=False, file_output="output",
                      layer_api=None, vllm_model=None,
                      request_used=None):

    from lib.lib_util import SubDocPage
    from lib.lib_ocr import img_to_texte, ocr_google_vision, gcp_doc_ai
    # VR TODO 9-5-25 this loop exists twice and is therefore only tested once, but we do not know which copy
    text, list_boxes, maxx, maxy, list_blocks = "", [], 0, 0, {}  # defaults so every branch leaves these bound
    if model == "tesseract":
        text, list_boxes, maxx, maxy, list_blocks = img_to_texte(f, verbose)
    elif model == "prompt":
        print("Prompt ! ")
        list_boxes = []
        maxx = 0
        maxy = 0
        list_blocks = {}

#        text = "TODO !"
        if verbose:
            print("before call request_gpt")
        try:
            if layer_api == None:
                (text, nb_token, modele) = ("", 0, "")  # was assigned to an unused `result`, leaving `text` unbound
            else:
                text, nb_token, modele = layer_api.prompt(request_used=request_used, gpt_model=vllm_model,
                                                          verbose=verbose,
                                                          images=[f])
        except Exception as e:
            print(str(e))
            text, nb_token, modele = "", 0, "ERROR IN PROMPT"

    elif model == "gcp_doc_ai":
        if os.stat(f).st_size > 20000000:
            print(" Expecting failure due to too big file : " + str(f) + " " + str(os.stat(f).st_size))
        else:
            print(" os.stat(f).st_size : " + str(os.stat(f).st_size))
            try:
                text, list_boxes, maxx, maxy, list_blocks = gcp_doc_ai(f, verbose=verbose)
            except Exception as e:
                print("ERROR TREATED AS WARNING THANKS TO RECOVERY : OCR gcp_doc_ai FAILED on " + str(
                    f) + " We will try the old one ! too bad if it is a cerfa ")
                print(str(e))
                text = ""
                list_boxes = []
                maxx = 0
                maxy = 0
                list_blocks = []
                text, list_boxes, maxx, maxy, list_blocks = ocr_google_vision(f, verbose)
    else:  # google_ocr
        text, list_boxes, maxx, maxy, list_blocks = ocr_google_vision(f, verbose)
    if folder_export_boxes != "":
        if not os.path.exists(folder_export_boxes):
            os.makedirs(folder_export_boxes)
        with open(folder_export_boxes + "/" + file_output + "_" + str(count) + ".json", "w") as of:
            import json
            of.write(json.dumps(list_boxes))
    map_file_size[f] = len(text)
    map_file_text[f] = text
    sdp = SubDocPage(count, text, f, list_boxes, maxx, maxy, list_blocks)

    return count, text, sdp
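
# Sketch of a direct call to sub_func_read_ocr (illustrative only, not part of
# the original module); assumes a local "page_1.png" and a working tesseract
# backend behind lib.lib_ocr.
def _example_sub_func_read_ocr():
    map_file_size, map_file_text = {}, {}
    count, text, sdp = sub_func_read_ocr("page_1.png", 1, "tesseract", False,
                                         map_file_size, map_file_text,
                                         folder_export_boxes="")
    print(count, len(text), sdp)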

# image_to_text
def datou_safia_step_image_to_text(input : dict, param_json : dict = {}, ce : CE = None,
                                   verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["file"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["text", "preprompt"]  # TODO VR REFACTO : this preprompt should be built elsewhere
    list_param_json = ["google_token", "model"]  # TODO VR REFACTO : have an editing interface, or a context from which this info is retrieved (but that context will potentially depend on the project to specify the model type, or on an "instantiation" of a datou)

    model = param_json["model"] if "model" in param_json else "google_ocr"
    file = input["file"]
    google_token = param_json["google_token"] if 'google_token' in param_json else None
    dpi = param_json["dpi"] if "dpi" in param_json else 72
    if google_token == None:
        print(" Will crash or not !")

    parse_prefix_file = bool(param_json["parse_prefix_file"]) if "parse_prefix_file" in param_json else True
    print(" parse_prefix_file : " + str(parse_prefix_file))
    parse_date_test_before_own_datou_step = bool(param_json["parse_date_test_before_own_datou_step"]) if "parse_date_test_before_own_datou_step" in param_json else False
    hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else None
    only_count = bool(param_json["only_count"]) if "only_count" in param_json else False
    only_extract_page = bool(param_json["only_extract_page"]) if "only_extract_page" in param_json else False

    # when multiple files are given in input, as a raw split
    multi_input = input["multi_input"] if "multi_input" in input else False
    saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""
    saxia_all_doc_separated = input["saxia_all_doc_separated"] if "saxia_all_doc_separated" in input else False
    use_split_complet = param_json["use_split_complet"] if "use_split_complet" in param_json else False
    if saxia_all_doc_separated and (saxia_split_end_csv != "") and use_split_complet:
        print("Here we want to not split with prompt; maybe we should use multi_input, and if there is only one file we could also activate this case !")
    create_output_hit = bool(param_json["create_output_hit"]) if "create_output_hit" in param_json else False

    if type(file) == list:
        if len(file) == 0:
            print("ERROR treated as WARNING : No Input file !, we can quit or not, it shouldn't matter ")

        # temporary
#        file = file[0]

        print(" ERROR treated as WARNING : only the first file will be treated : " + str(file))
        print("TODO multiple files not implemented yet !")
        if multi_input:
            print("VR TODO 24/6/24 wip multi_input => is in fact working !")
        else:
            print(" We have not decided yet the default behavior VR TODO CDC 24/6/24 : for now this is an internal error to be in this situation")
        one_file_reference = file[0]
    else:
        one_file_reference = file
        file = [file]

    if one_file_reference == None:
        print(" ERROR treated as WARNING : No Input file reference !, we can quit or not, it shouldn't matter ")
    size_file = os.stat(one_file_reference).st_size
    created_at = datetime.datetime.fromtimestamp(os.stat(one_file_reference).st_ctime)
    in_folder = os.path.dirname(one_file_reference)
    work_folder_images = os.path.dirname(one_file_reference)

    if one_file_reference.lower().endswith(".pdf"):
        from lib.lib_util import from_pdf_to_list_pngs
        list_pngs, count_per_doc, list_of_list_of_pages = from_pdf_to_list_pngs(file, dpi=dpi, hash_id_treatment=hash_id_treatment, only_count=only_count)
    else:
        if multi_input:
            print("Internal error as of 24/6/24, behavior to be developed CDC TODO VR 24/6/24")
        list_pngs = [one_file_reference]
        list_of_list_of_pages = [[1]]

    if len(list_pngs) == 0:
        print("TO activate after some test !")
#        saxia_all_doc_separated = True

    if saxia_all_doc_separated:
        print(" We should avoid doing split with prompt and treat all the different cases !")

    from lib.lib_util import parse_id_date_nb_page_folder
    json_prefix_file = {}
    if parse_prefix_file:
        date_input = input["date"] if "date" in input else param_json["date"] if "date" in param_json else None
        nb, id, date = parse_id_date_nb_page_folder(one_file_reference)
        if id == 0:
            id = param_json["id"] if "id" in param_json else 0
        if nb == 0:
            nb = len(list_pngs)
        print(" date " + str(date) + " and id " + str(id) + " and nb " + str(nb) + " and file " + str(one_file_reference) + " and input keys " + str(input.keys()))
        if date == None:
            print("Using date_input as date " + str(date_input) + " and date " + str(date) + " and id " + str(id) + " and nb " + str(nb) + " and file " + str(one_file_reference) + " and input keys " + str(input.keys()))
            if date_input == None:
                date = datetime.datetime.now().strftime("%Y%m%d")
            else:

                if type(date_input) == str:
                    from lib.lib_util import parse_date
                    date, parsed_or_forced = parse_date(date_input, settings=None)
                    date = date.strftime("%Y%m%d")
                else:
                    date = date_input.strftime("%Y%m%d")
        json_prefix_file = {"nb" : nb,
                            "date" : date,
                            "id" : id}
    filename_at = json_prefix_file["date"] if "date" in json_prefix_file else datetime.datetime.now().strftime("%Y%m%d 00:00:00")

    from lib.lib_util import create_prefix_file_name_from_json_prefix
    prefix_file = create_prefix_file_name_from_json_prefix(json_prefix_file)

    print("keyword_to_parse_for_suivi_and_crash_id_file : " + str(prefix_file))
    print("keyword_to_parse_for_suivi_and_crash_hit : " + str(hash_id_treatment))

    if only_extract_page:
        output = {"files" : list_pngs, "nb_page" : len(list_pngs)}
    elif only_count:
        output = {}
    else:
        map_file_size = {}
        map_file_text = {}

        begin_page = bool(param_json['begin_page']) if 'begin_page' in param_json else None
        end_page = bool(param_json['end_page']) if 'end_page' in param_json else None
        limit = param_json["limit"] if "limit" in param_json else 0
        parallel = bool(param_json["parallel"]) if "parallel" in param_json else False
        nb_thread = param_json["nb_thread"] if "nb_thread" in param_json else 10

        folder_export_boxes = param_json['folder_export_boxes'] if 'folder_export_boxes' in param_json else ""

        begin_page_txt = ""
        end_page_txt = ""

        complete_text = ""
        list_page_content = []
        list_page_content_text = []
        file_output = prefix_file + "_" + input["hash_id_treatment"] if "hash_id_treatment" in input else prefix_file + "_" + str(uuid4())

        request_used = input["preprompt"] if "preprompt" in input else None
        vllm_model = param_json["vllm_model"] if "vllm_model" in param_json else None  # "mistral-small3.1"

        print(str(list_pngs))
        print(" verbose : " + str(verbose))
        print("About to parallel or not")
        if parallel and len(list_pngs) > nb_thread:
            print("WARNING : not implemented yet for parallel and more than nb_thread images")

        if parallel and len(list_pngs) <= nb_thread:
            from lib.datou.lib_parallel import multi_thread_image_read
            map_pids_path, map_sdp, map_text = multi_thread_image_read(model, verbose, map_file_size, map_file_text,
                                                                       folder_export_boxes, begin_page, end_page, file_output,
                                                                       nb_thread=nb_thread, list_pngs=list_pngs,
                                                                       layer_api=None, vllm_model=vllm_model)

            for i in range(len(list_pngs)):
                nb = i + 1
                sdp = map_sdp[nb]
                list_page_content.append(sdp)
                list_page_content_text.append(sdp.content)

            print(" Inside parallel ! ")
            print(" map_text.keys() " + str(map_text.keys()))
        else:
            map_text = {}
            count = 1
            for f in list_pngs:
                if limit > 0 and count > limit:
                    break
                count, text, sdp = sub_func_read_ocr(f, count, model, verbose, map_file_size, map_file_text, folder_export_boxes,
                                                     begin_page, end_page, file_output, layer_api=layer_api,
                                                     vllm_model=vllm_model, request_used=request_used)

                map_text[count] = text
                list_page_content.append(sdp)
                list_page_content_text.append(sdp.content)
                complete_text += begin_page_txt + text + end_page_txt
                count = count + 1
            print("Outside parallel")
            print(" map_text.keys() " + str(map_text.keys()))

        print(" map_text.keys() " + str(map_text.keys()))
        for page in map_text:
            print(" size : " + str(len(map_text[page])))

        if saxia_split_end_csv != "":
            print(" list_of_list_pages should be just [range(1, len(list_pngs))]")
            from lib.lib_util import build_list_of_list_from_split
            list_of_list_of_pages = build_list_of_list_from_split(saxia_split_end_csv, len(list_pngs))

        from lib.lib_util import create_transcript_group_of_pages
        complete_texts = create_transcript_group_of_pages(list_of_list_of_pages, map_text)
        if not multi_input and saxia_split_end_csv == "":
            print(" We should have only one group of pages here !")
            if len(complete_texts) != 1:
                print(" WARNING data will be ignored !")
            complete_text = complete_texts[0]

        # The next if and else have now been refactored above (how do I check?) TODO 11/9/24

        # VR TODO refacto to be merged with the other leg of the condition VR 11/9/24
        # VR TODO refacto TESTED IN ONE CASE, TO REMOVE ON 15/10/2024
        # and remove count_per_doc
        # if multi_input:
        #     complete_texts = []
        #     cum_id_page = count_per_doc[0]
        #     id_part = 0
        #     one_complete_text = ""
        #     for i in range(len(list_pngs)):
        #         nb = i + 1
        #         text = map_text[nb]
        #         if begin_page:
        #             begin_page_txt = "\n------\nBegin Page " + str(nb) + "\n------\n"
        #         if end_page:
        #             end_page_txt = "\n------\nEnd Page " + str(nb) + "\n------\n"
        #         one_complete_text += begin_page_txt + text + end_page_txt
        #
        #         ## all the following logic was there to avoid having a list_of_list_of_pages, so we now do without it
        #         if i + 1 == cum_id_page:
        #             complete_texts.append(one_complete_text)
        #             one_complete_text = ""
        #             id_part += 1
        #             if id_part < len(count_per_doc):
        #                 cum_id_page += count_per_doc[id_part]
        #             else:
        #                 if i + 1 != len(list_pngs):
        #                     print("Internal error count_per_doc")
        #                 else:
        #                     print("This is the end !")
        # elif saxia_split_end_csv == "" :
        #     complete_text = ""
        #     # for i in range(len(list_pngs)):
        #     #     nb = i + 1
        #     if len(list_of_list_of_pages) != 1:
        #         print("INTERNAL ERROR WHILE REFACTORING ! ")
        #     for nb in list_of_list_of_pages[0]:
        #         text = map_text[nb]
        #         if begin_page:
        #             begin_page_txt = "\n------\nBegin Page " + str(nb) + "\n------\n"
        #         if end_page:
        #             end_page_txt = "\n------\nEnd Page " + str(nb) + "\n------\n"
        #         complete_text += begin_page_txt + text + end_page_txt

        # TODO VR 5-4-25 : this is for auto split : not used yet
        if parse_date_test_before_own_datou_step:
            from lib.lib_util import parse_date_test_before_own_datou_step
            map_res_page_date = parse_date_test_before_own_datou_step(list_page_content)
            print("TO USE and TEST or use when failing in load_tab")
        else:
            map_res_page_date = {}

        # prefix_prompt_input = "Merci d'estimer une approximation basique de l'impact carbone des produits se trouvant ici, ainsi que leur nombre de calories et le prix si possible, meme de manière approximative et de la renvoyer sous forme de tableau avec pour colonnes : PRODUIT, CO2, CALORIES, PRIX :\n"

        # VR TODO not too happy with this, in fact not at all 21-1-24
        # remove the hard-coded default step; we can tolerate it either in the input from the interface or with the default datou for jpg input
        prefix_prompt_input = input["preprompt"] if "preprompt" in input else ""  # "Merci d'estimer l'impact carbone des produits se trouvant ici, ainsi que leur nombre de calories et le prix si possible, meme de manière approximative, ou incomplet ou que tu fasses un raisonnement ouvert pour estimer tu mettras n/c quand tu ne peux pas estimer et de la renvoyer sous forme de tableau avec pour colonnes : PRODUIT, CO2, CALORIES, PRIX :\n"
        # TODO VR REFACTO : the files must also be deleted, here or elsewhere

        print("prefix_prompt_input : " + str(prefix_prompt_input)[:100])

        ce.compute_cost_search("google_ocr", len(map_file_text))

        data = [
            {
                "id": file_output,
                "text": "\n".join(complete_texts)
            }
        ]

        if multi_input or saxia_split_end_csv != "":
            complete_text = complete_texts

        print("begin_page complete_text : " + str(complete_text[:100]).replace("\n", "§§"))

        # If the preprompt is empty we could also leave it out, but the prompt_gpt step handles that case anyway, so there you go!
        output = {"text" : complete_text, "preprompt" : prefix_prompt_input,
                  "json_to_save" : data,
                  "list_page_content" : list_page_content,
                  "list_page_content_text" : list_page_content_text,
                  # "map_file_size" : map_file_size, "map_file_text" : map_file_text,
                  "images": [f for f in list_pngs],
                  "paragraphs" : [p.list_blocks["paragraphs"] if "paragraphs" in p.list_blocks else [] for p in list_page_content],
                  "in_folder" : in_folder,
                  "work_folder_images" : work_folder_images,
                  "map_res_page_date" : map_res_page_date}

        if saxia_all_doc_separated and use_split_complet:
            print("TO TEST")
            output["multi_input"] = True
            output["text_only_for_meta_data_and_not_split"] = output["text"]
            output["text"] = []

    if parse_prefix_file:
        output["prefix_file"] = json_prefix_file
        output["id_file"] = prefix_file
        output["nb_page"] = len(list_pngs)
        output["filename_at"] = filename_at

    output["input_file_available_at"] = created_at
    output["size_file"] = size_file

    if create_output_hit:
        output["output_hit"] = prefix_file + "_" + input["hash_id_treatment"] if "hash_id_treatment" in input else prefix_file + "_" + str(uuid4())

    return output
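
# Illustrative call of the OCR step (not part of the original module); assumes
# a local "scan.pdf" and a configured google_ocr backend, and reuses the
# _StubCE defined above.
def _example_image_to_text():
    out = datou_safia_step_image_to_text({"file": "scan.pdf"},
                                         param_json={"model": "google_ocr", "dpi": 150},
                                         ce=_StubCE())
    print(out["nb_page"], str(out["text"])[:200])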


def datou_safia_step_request_gpt(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerPrompt = None) -> dict:
    list_input = ["preprompt", "text"]  # TODO VR REFACTO ARCHI : check the inputs and outputs
    list_output = ["result", "request"]  # TODO VR REFACTO : this preprompt should be built elsewhere
    list_param_json = ["openai_token", "gpt_model"]  # TODO VR REFACTO : have an editing interface, or a context from which this info is retrieved (but that context will potentially depend on the project to specify the model type, or on an "instantiation" of a datou)

    if verbose:
        print("Inside request gpt")

    if "preprompt" in input and input["preprompt"] != "":
        preprompt = input["preprompt"]
    elif "preprompt" in param_json:
        preprompt = param_json["preprompt"]
    else:
        preprompt = ""
    print(" all keys input : " + str(input.keys()))
    text = input["text"] if "text" in input else ""
    multi_input = input["multi_input"] if "multi_input" in input else False
    saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""

    exec_if_true = param_json["exec_if_true"] if "exec_if_true" in param_json else True
    if not exec_if_true or exec_if_true == {}:
        print(" exec_if_true is falsy, we skip the prompt step ")
        return input
    print("Passed exec_if_true prompt step")

    if type(text) == list:
        # text = text[0]
        print(" ERROR treated as WARNING : only the first text will be treated : " + str(text))
        print("TODO multiple files not implemented yet !")
        if multi_input or saxia_split_end_csv != "":
            print(" Here we need to do something !")
            texts = text
        else:
            print("As of 24/6/24 internal error")
            texts = text  # fallback so the loop below still receives something instead of crashing on an unbound name
    else:
        texts = [text]

    model = ""
    size_correct = True
    nb_token = 0
    result = ""
    request = ""
    if len(texts) == 0:
        print("Empty list of texts as input prompt !")
    results = []
    for text in texts:
        print(" begin text begin_page " + str(text[:50].replace("\n", " ")))
        request = preprompt + text

        # TODO to remove, since this has been injected into the configuration
        openai_token = param_json["openai_token"]
        gpt_model = param_json["gpt_model"] if "gpt_model" in param_json else "gpt-4"

        from lib.lib_util import check_and_truncate_query_max_token
        size_correct, request_truncated = check_and_truncate_query_max_token(request)

        request_used = request if size_correct else request_truncated

        if verbose:
            print("before call request_gpt")
        try:
            if layer_api == None:
                (result, nb_token, model) = ("", 0, "")
            else:
                result, nb_token, model = layer_api.prompt(request_used, gpt_model, verbose=verbose)
        except Exception as e:
            print(str(e))
            result, nb_token, model = "", 0, "ERROR IN PROMPT"

        results.append(result)

#    from lib.lib_openai import request_gpt
#    result, nb_token, model = request_gpt(openai_token, request_used, gpt_model, verbose = verbose)
    if verbose:
        print("after request_gpt")
    ce.compute_cost_search(model, nb_token)
    if not size_correct:
        print("WARNING TOO LONG QUERY ")
        result = "Your query was too long and has been truncated : " + result

    if multi_input or saxia_split_end_csv != "":
        result = results

    return {"result" : result, "request" : request}


# send_mail
def datou_safia_step_send_mail(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["request", "result", "file"]
    list_output = ["result", "object"]
    list_param_json = ["info_auth", "hash_id_treatment", "privacy", "from_mail_to_send"]

    privacy = param_json["privacy"] if "privacy" in param_json else False
    info_auth = param_json["info_auth"] if "info_auth" in param_json else None
    from_mail_to_send = param_json["from_mail_to_send"] if "from_mail_to_send" in param_json else None
    hash_id_treatment = param_json["hash_id_treatment"] if "hash_id_treatment" in param_json else input["hash_id_treatment"] if "hash_id_treatment" in input else None
    send_mail = param_json["send_mail"] if "send_mail" in param_json else True
    send_sms = param_json["send_sms"] if "send_sms" in param_json else False
    send_slack = param_json["send_slack"] if "send_slack" in param_json else False
    type_email = param_json["type_email"] if "type_email" in param_json else "plain_text"

    # VR TODO : we probably need to add an override (we can use assoc for that), but when uploading an audio file default behaviors are expected
    result = param_json["result"] if "result" in param_json else input["result"] if "result" in input else ""
    request = input["request"] if "request" in input else ""
    file = input["file"] if "file" in input else ""
    object = input["object"] if "object" in input else param_json["object"] if "object" in param_json else "Prompt request by email to Fotonower assistant APIA"

    if type(object) != str:
        print("object has not been converted to string, we will do it !")
        object = str(object)

    from auth.lib_conf_system import collect_version_from_datou_and_proj_and_app_recursively
    version = input["version"] if "version" in input else collect_version_from_datou_and_proj_and_app_recursively()

    if privacy:
        privacy_footer = """
        Privacy is ON, GDPR is strictly implemented and no data sent will be kept outside your email, to address your issue and keep a record of your usage; please find more info here https://www.fotonower.com/fpa
        Used hash is :
        """ + str(hash_id_treatment)
    else:
        privacy_footer = """
        Privacy is OFF, you can OPT-OUT by sending an email to dpo@fotonower.com with object : OPT-OUT FPA (Fotonower Prompt Assistant) : """ + str(hash_id_treatment) + "<br>\n"

    version_footer = "Generated with Safia " + version + "<br>\n" + \
                     "More info on https://safia.app or https://saxia.app "

    import logging
    logger = logging.getLogger()
    logger.info("before send_mail test GITVELOURS in os.environ ")
    logger.info("before get_info_auth ")
    if type_email == "html":
        html = result
    else:
        html = str(info_auth) + str(result) + "<br><br><br>\n" + privacy_footer + "\n" + version_footer
    content_txt = str(info_auth) + "\n" + str(result) + "\n<br>\n<br><br><br>\n" + privacy_footer + "\n" + version_footer

    # Step send mail => even though it is indicated below
    from lib.lib_speechtotext import remove_extension
    txt_file = remove_extension(file) + ".txt"
    logger.info("After remove_extension ")
    import os, sys
    logger.info(os.getcwd())
    logger.info("to write txt_file " + str(txt_file))
    if file != "":
        with open(txt_file, "w") as f:
            f.write(request)
    else:
        # txt_file is a file, not a directory, so shutil.rmtree silently never removed it
        if os.path.exists(txt_file):
            os.remove(txt_file)

    logger.info("After write ")

    if send_mail:
        if "GITVELOURS" in os.environ:
            logger.info("YES for GITVELOURS : " + str(os.environ["GITVELOURS"]))
            pythonpathfotonower = os.path.join(os.environ["GITVELOURS"], "python")
            sys.path.append(pythonpathfotonower)
        logger.info("before import ses mailer ! ")
        import mtr.ses_mailer
        ses_mailer = mtr.ses_mailer.SesMailer()

        logger.info("before get_from_mail_to_send ! ")
        dest_mail_list = from_mail_to_send

        logger.info("logger About to send email : " + str(dest_mail_list))

        if "cc" in input:
            dest_mail_list += "," + input["cc"]

        print("About to send email !")
        if verbose:
            print(" type html : " + str(type(html)))
            print("html : " + str(html))

        sender = "assistant@fotonower.com"
        try:
            print(" Maybe type_email is useless 28/12/25 ")
            print(" Sending " + sender + " to " + str(dest_mail_list))
            if file != "":
                ret = ses_mailer.send_email_with_attachment(sender, dest_mail_list,
                                                            object, body_html=html, file_path=txt_file, body_text=content_txt)
            else:
                html = result
                ret = ses_mailer.send_html_email(sender, dest_mail_list, object, html, content_txt)
        except Exception as e:
            print(str(e))

    if send_sms:
        print("TODO send_sms")
    if send_slack:
        print("TODO send_slack")

    return {"object" : object, "body" : result}  # , "version" : version

# git_action
def datou_safia_step_git_action(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["request", "result"]
    list_output = []
    list_param_json = ["defaut_github_issue", "github_token", "privacy"]

    privacy = param_json["privacy"]
    defaut_github_issue = param_json["defaut_github_issue"]
    github_token = param_json["github_token"]

    request = input["request"]
    result = input["result"]

    if privacy:
        print("With privacy enabled, we do not append info to any github issue, so we continue !")
    else:
        own_repo_nb = defaut_github_issue.split("/")
        if github_token != "" and len(own_repo_nb) == 3 and own_repo_nb[2].isdigit():
            own_repo = own_repo_nb[0] + "/" + own_repo_nb[1]
            issue_number = int(own_repo_nb[2])
            from lib.lib_github import append_comment
            message_comment_github = "[up](#up)\n\n" + result + "\n<br>\n----\nRAW MESSAGE\n------<details>\n\n" + request + "\n</details>"
            append_comment(github_token, verbose=verbose,
                           message_comment=message_comment_github,
                           OwnRepo=own_repo,
                           issue_number=issue_number)
            print("github message appended !")

    output = {"log_git_action" : "git_action_done"}
    return output
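
# Illustrative call of the git step (not part of the original module); with
# privacy=True no GitHub call is made, so no real token is needed.
def _example_git_action():
    out = datou_safia_step_git_action({"request": "raw", "result": "answer"},
                                      param_json={"privacy": True,
                                                  "defaut_github_issue": "owner/repo/1",
                                                  "github_token": ""},
                                      ce=_StubCE())
    print(out["log_git_action"])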


# doc_to_json TO TEST
def datou_safia_step_doc_to_json(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["file"]
    list_output = ["json_path", "log_d2j"]
    list_param_json = []  # managed_extension (for freemium ?)

    file = input["file"]

    # TODO VR : get the list from somewhere else
    managed_extension = param_json["managed_extension"].split(",") if "managed_extension" in param_json else [".mp3", ".ogg", ".amr", ".m4u", ".wav", ".jpeg", ".jpg", ".png", ".pdf", ".txt", ".docx", ".json", ".py"]  # split("") raised ValueError; a comma-separated list is assumed here

    from lib.lib_safia import safia_import
    json_to_import_path, list_detailed_time_safia_import_to_json, sorted_dict_unmanaged_extension = safia_import(in_file=file,  # managed_extension=managed_extension,
                                                                                                                 verbose=False)

    output = {"json_path" : json_to_import_path, "log_d2j" : list_detailed_time_safia_import_to_json, "unmanaged_extension" : str(sorted_dict_unmanaged_extension)}
    return output
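
# Illustrative call (not part of the original module); assumes a local
# "report.pdf" that lib.lib_safia.safia_import can handle.
def _example_doc_to_json():
    out = datou_safia_step_doc_to_json({"file": "report.pdf"}, ce=_StubCE())
    print(out["json_path"])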


# import_json TO TEST
def datou_safia_step_import_json(input : dict = {}, param_json : dict = {}, ce : CE = None,
                                 verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["json_path", "json_to_save"]  # one of them
    list_output = ["log_import"]
    list_param_json = ["table_documents", "openai_token"]

    json_path = input["json_path"] if "json_path" in input else ""
    json_to_save = input["json_to_save"] if "json_to_save" in input else []

    table_documents = param_json["table_documents"] if "table_documents" in param_json else ""
    openai_token = param_json["openai_token"] if "openai_token" in param_json else ""

    from lib.import_util.lib_import_retrieval.scripts.process_json.process_json import process_json_dump, process_json_dump_aux
    import asyncio

    # logger.info(" before process_json_dump : json_to_import_path : " + str(json_path))
    try:
        from server.safia import lpgss_singleton  # VR to refactor with abstract classes ?
        lpgss_singleton.get_admin_situation(verbose=verbose)
        if json_path != "":  # TODO better test of existence ??
            total_nb_token, used_model = asyncio.run(process_json_dump(json_path, {}, False, False, None, lpgss_singleton, openai_token, table_documents, verbose=verbose))
        else:
            total_nb_token, used_model = asyncio.run(process_json_dump_aux(json_to_save, {}, False, False, None, lpgss_singleton, openai_token, table_documents, verbose=verbose))
        # result_json["log"] += " ,after process json to documents table : total_nb_token : " + str(total_nb_token)

    except Exception as e:
        import logging
        logger = logging.getLogger()
        logger.info(str(e))
        print(str(e))
        logger.info("Bug in datou_safia_step_import_json")
        print("Bug in datou_safia_step_import_json")
        used_model = "crashed"
        total_nb_token = -1

    ce.compute_cost_search(used_model, total_nb_token)

    output = {"log_import_json" : "Inserted in file : " + str(json_path)}
    return output
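
# Illustrative call (not part of the original module); assumes a running safia
# server context providing lpgss_singleton, and reuses the _StubCE above.
def _example_import_json():
    docs = [{"id": "doc_1", "text": "hello"}]
    out = datou_safia_step_import_json({"json_to_save": docs},
                                       param_json={"table_documents": "documents"},
                                       ce=_StubCE())
    print(out["log_import_json"])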


def datou_safia_step_get_embedding(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["text"]
    list_output = ["embedding"]
    list_param_json = ["openai_token"]

    text = input["text"]
    openai_token = param_json["openai_token"]

    # datou_step
    from lib.lib_openai import get_embeddings
    # embedding_model fixed for now
    embedding_input = get_embeddings(text, openai_token, verbose=verbose)

    info_context_exec = {"display_info" : {"embedding" : "delete", "info_context_exec" : "show"}}

    output = {"embedding" : embedding_input, "info_context_exec" : info_context_exec}
    return output
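
# Illustrative call (not part of the original module); assumes a valid OpenAI
# token for lib.lib_openai.get_embeddings; the token value is a placeholder.
def _example_get_embedding():
    out = datou_safia_step_get_embedding({"text": "hello world"},
                                         param_json={"openai_token": "sk-..."},
                                         ce=_StubCE())
    print(type(out["embedding"]))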


# append_to_doc_content TO TEST (the original comment mislabeled this as search_doc_NN)
def datou_safia_step_append_to_doc_content(input : dict = {}, param_json : dict = {},
                                           ce : CE = None, verbose : bool = False,
                                           layer_api : LayerGeneric = None) -> dict:
    list_input = ["result", "document_id", "project_id", "user_id"]  # TODO VR I no longer remember whether datous are right_safe or must check permissions themselves =>
    # - [ ] TODO we first need to define the terminology
    list_output = ["references"]
    list_param_json = ["openai_token"]  # TODO VR 15-6-23 : in_match_count optional => handle optional param_json entries

    openai_token = param_json["openai_token"]

    document_id = input["document_id"] if "document_id" in input else param_json["document_id"] if "document_id" in param_json else ""
    project_id = input["project_id"] if "project_id" in input else param_json["project_id"] if "project_id" in param_json else 0
    user_id = param_json["user_id"] if "user_id" in param_json else 0
    result = input["result"] if "result" in input else input["text"] if "text" in input else ""

    table_documents = param_json["table_documents"] if "table_documents" in param_json else "dummy_table_documents_no_access"

    # VR TODO 4-12-23 : it will be hard to add retrieval of the project_id, because datous seem contextualized to one project (the default one)
    # However I do not know how to fetch the lss inside a datou (at some point I did not want that; well yes, but now
    # I want to do varied things with projects (for instance have input projects and output/storage projects)
#    has_access = lss.lib_right.get_role_on_project(lss.get_user_id(), project_id)
    from datetime import datetime
    today_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    from server.safia import lpgss_singleton, lib_right_singleton

    # TODO VR : is this really what we want? shouldn't we rather get the lss from the execution context (we would then have the user_id)
    from lib.lib_safia_system import LibSafiaSystem
    lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
    lss.user_id = user_id  # THIS is a hack
    content = lss.load_document(document_id, project_id, chunk_id=None, verbose=verbose)

    new_content = content + "\n" + result

#    list_docs = lpgss_singleton.get_documents(table_documents, document_id, verbose = verbose)
#    if len(list_docs) != 1:
#        print(" Problem with documents to append ! ")
#    content = list_docs[0]["content"] + "\n" + result  # TODO VR 4-12-23 : we could also add a tag saying this is an append

    save_document_data = {"document_id" : document_id, "document_content" : new_content}
    # TODO VR : in my opinion it would be better to abstract this (into the layer thing, and not have this openai_token in process_json or anywhere else !)
    total_nb_token, used_model = lss.save_document(save_document_data, project_id, openai_token=openai_token)
    ce.compute_cost_search(used_model, total_nb_token)

    # TODO VR : in my opinion the result will already be inside, since we append everything as we go !
    output = {"result" : result, "references" : [document_id]}
    return output
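
# Illustrative call (not part of the original module); assumes a running safia
# server context (lpgss_singleton, lib_right_singleton) and an existing
# document to append to; ids and token are placeholders.
def _example_append_to_doc_content():
    out = datou_safia_step_append_to_doc_content({"result": "new paragraph",
                                                  "document_id": "doc_1",
                                                  "project_id": 1},
                                                 param_json={"openai_token": "sk-...", "user_id": 1},
                                                 ce=_StubCE())
    print(out["references"])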

# search_doc_NN TO TEST
def datou_safia_step_search_doc_NN(input : dict = {}, param_json : dict = {},
                                   ce : CE = None, verbose : bool = False,
                                   layer_api : LayerGeneric = None) -> dict:
    list_input = ["embedding"]
    list_output = []
    list_param_json = ["match_page_sections", "in_match_count"]  # TODO VR 15-6-23 : in_match_count optional => handle optional param_json entries, VR 4-12-23 but shouldn't we rather put this info in the input ?

    match_page_sections = param_json["match_page_sections"]
    in_match_count = param_json["in_match_count"] if "in_match_count" in param_json else 5
    embedding = input["embedding"]
    text = input["text"] if "text" in input else ""

    from server.safia import lpgss_singleton

    from lib.stockage.lib_pgvector import find_docs
    result = find_docs(embedding, lpgss_singleton,
                       function=match_page_sections,
                       in_match_count=in_match_count,  # was hard-coded to 5, silently ignoring the param
                       verbose=False)

    preprompt = "Merci de repondre à la question à partir des documents et ne pas mentionné que tu es un chatbot sinon quelqu'un va mourir :"

    request = preprompt + text + str(list(map(lambda x : x["document_id"] + " " + x["content"], result)))

    list_document_ids = list(map(lambda x : x["id"], result))  # document_id is without the chunk id

    print("request : " + str(request))

    output = {"result" : result, "request" : request, "text" : request, "references" : list_document_ids}
    return output
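
# Illustrative call (not part of the original module); assumes a pgvector
# backend reachable through lpgss_singleton; the zero vector and its size are
# placeholders, matching the usual OpenAI embedding dimension.
def _example_search_doc_NN():
    embedding = [0.0] * 1536
    out = datou_safia_step_search_doc_NN({"embedding": embedding, "text": "my question"},
                                         param_json={"match_page_sections": "match_page_sections",
                                                     "in_match_count": 3},
                                         ce=_StubCE())
    print(out["references"])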


# result_to_json
def datou_safia_step_result_to_json(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict:
    list_input = ["request", "result"]
    list_output = []
    list_param_json = ["user"]

    print("TO USE TO CREATE NEW STEP")

    user = param_json["user"] if "user" in param_json else None
    if user == None:
        print(" This shows map_reduce doesn't pass the complete_param_json correctly, and that this param doesn't have the user key, because the mismatch between app_param, user_param, exec_param, step_param is not well defined ! ")
        return {}

    prefix = param_json["prefix"] if "prefix" in param_json else ""

    from lib.lib_util import replace_non_alpha_with_underscore

    # NOTE: the original branch here was inverted (the None case is unreachable after
    # the early return above, so every user was overwritten with "0.0.0.0"); we
    # sanitize the actual user instead
    user = replace_non_alpha_with_underscore(user)

    from datetime import datetime
    curr = datetime.now().strftime("%Y%m%d%H%M%S%f")
    if prefix == None:
        name = user + "_" + curr
    else:
        name = prefix + user + "_" + curr

    total_nb_token = 0
    used_model = ""
    references = []
    if "request" in input and "result" in input:
        request = input["request"]
        result = input["result"]

        id_request = "///UPLOAD//REQUEST//" + name
        id_result = "///UPLOAD//RESULT//" + name
        references = [id_request, id_result]

        # TODO VR : here we should parse the title in result and set it in request and result ?
        # TODO VR : how should hostname be configured, in context_process_server ?
        # TODO VR : temporary files must be deleted, keeping only the ones we want from the cdn

        list_reference = "\n## No references\n"
        if "references" in input:
            list_reference = "\n## References Internal and External\n"
            r = 0
            for ref in input["references"]:
                if ref.endswith(".ogg"):
                    ref = ref.replace(".ogg", ".mp3")
                list_reference += "Ref " + str(r) + " : " + ref + "\n"
                r = r + 1

        data = [
            {
                "id" : id_request,
                "text" : request
            },
            {
                "id" : id_result,
                "text" : result + list_reference
            }
        ]
    else:
        data = []

    output = {"json_to_save" : data, "references" : references}
    return output
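
# Illustrative call (not part of the original module); the step only shapes a
# json_to_save payload, so it runs without any external service.
def _example_result_to_json():
    out = datou_safia_step_result_to_json({"request": "raw request", "result": "the answer"},
                                          param_json={"user": "someone@example.com"},
                                          ce=_StubCE())
    print(out["references"])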


# load_graph => to debug TODO VR to debug
def datou_safia_step_load_existing_graph(input : dict = {}, param_json : dict = {},
                                         ce : CE = None, verbose : bool = False) -> dict:  # pragma no cover icebox
    list_input = []  # TODO VR add file
    list_output = ["preprompt"]
    list_param_json = []  # TODO VR REFACTO

    print("TO define")
    # TODO VR 14-6 : how do we build prompts in several pieces ?
    # VR this has not worked yet !
    # if object == "edit_graph":
    from lib.lib_graph import read_graph
    input_graph = read_graph(temp_dir="static/temp/graph",
                             graph_name="graph",
                             verbose=verbose)
    output = {"preprompt" : "\nEn prenant comme graph de départ celui-ci :\n" + input_graph}
    return output

def datou_safia_step_load_url_content_text(input: dict = {},
                                           param_json: dict = {},
                                           ce: CE = None,
                                           verbose: bool = False,
                                           layer_api : LayerGeneric = None) -> dict:
    list_input = ["url"]
    list_output = ["content"]
    list_param_json = []

    import requests
    from bs4 import BeautifulSoup

    # The URL of the page whose text we want to extract
    url = input["url"] if "url" in input else "https://www.fotonower.com/"

    print("url : " + str(url))

    try:
        # Send a GET request to the URL
        response = requests.get(url)
    except Exception as e:
        print(str(e))
        return {"content": "Error in request " + str(e)}

    # Parse the HTML content of the page with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all the text from the page
    texte_page = soup.get_text(separator='<br>', strip=True)
    texte_page = texte_page.replace("|", " ")
    texte_page = texte_page.replace("\n", " ")
    texte_page = texte_page.replace("\r", " ")

    # Print the text of the page
    if verbose:
        print(texte_page)

    output = {"content": texte_page}
    return output
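
# Illustrative call (not part of the original module); only needs network
# access and the requests / bs4 dependencies.
def _example_load_url_content_text():
    out = datou_safia_step_load_url_content_text({"url": "https://www.fotonower.com/"})
    print(out["content"][:200])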

def aux_map_reduce_loop(list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps, param_json,
                        verbose, privacy, with_audit, strat_reduce, reduced_result, list_audit_map_reduce,
                        hit_main_datou_step_map_reduce="to_be_passed_as_argument", id_step_incomplete_args=None):
    for textbout in list_texts:
        if verbose:
            print("text : " + str(textbout))
            print(" hit_main from map_reduce : " + hit_main_datou_step_map_reduce)
        from lib.datou.datou_exec import datou_exec
        input_datou = textbout if strat_reduce == "run_one_datou" else {aux_input_var: textbout}

        # TODO to clean up: we force nothing for now VR 15-1-25
        with_audit_save_var = with_audit
        # with_audit = False
        if 'with_audit' in input_datou:
            with_audit_save = input_datou['with_audit']
            # input_datou['with_audit'] = False

        output, audit_json = datou_exec(list_steps, input=input_datou,
                                        complete_param_json=param_json, verbose=verbose,
                                        privacy=privacy, list_param_json_steps=list_param_json_steps,
                                        with_audit=with_audit, id_step_incomplete_args=id_step_incomplete_args)

        result = output if strat_reduce == "run_one_datou" else output[res_json_field] if res_json_field in output else ""
        if with_audit:
            if verbose:
                print("audit_json : " + str(audit_json))
            list_audit_map_reduce.append(audit_json)

        if 'with_audit' in input_datou:
            try:
                input_datou['with_audit'] = with_audit_save
            except Exception as e:
                print(" Incomprehensible that this does not work")
        with_audit = with_audit_save_var

        # TODO TO TEST VR 26-1-24 : and add image also ?!?
        # df[res_json_field] = result

        if strat_reduce == "concat":
            reduced_result += result
        elif strat_reduce == "append_page":
            print(
                "Thanks to objects (sub_page_doc) being references in Python, the result is already at its correct position")
        elif strat_reduce == "run_one_datou":
            reduced_result = result
            reduced_result["hit_internal"] = "tofind"
        else:
            print("Unsupported : strat_reduce : " + str(strat_reduce))

    return reduced_result

def datou_safia_step_map_reduce(input : dict = {},
                                param_json : dict = {},
                                ce : CE = None,
                                verbose : bool = False,
                                layer_api : LayerGeneric = None) -> dict:
    list_input = ["text", "datou_int_id", "strat_reduce", "param.size", "param.overlap", "res_json_field"]
    # TODO update sql input
    list_output = []
    list_param_json = []
    with_audit = input["with_audit"] if "with_audit" in input else False

#    "text", "datou_int_id", "strat_reduce", "param.size", "param.overlap"

    text = input["text"] if "text" in input else ""
    list_page_content = input["list_page_content"] if "list_page_content" in input else []
    list_page_content_text = input["list_page_content_text"] if "list_page_content_text" in input else []
    paragraphs = input["paragraphs"] if "paragraphs" in input else []
    id_step_incomplete_args = param_json["id_step_incomplete_args"] if "id_step_incomplete_args" in param_json else None

    # VR 19-5 : I no longer remember why I need to look in the input
    datou_int_id = input["datou_int_id"] if "datou_int_id" in input else param_json["datou_int_id"] if "datou_int_id" in param_json else -1
    strat_reduce = param_json["strat_reduce"] if "strat_reduce" in param_json else "concat"
    param = param_json["param"] if "param" in param_json else {"size" : 10000, "overlap" : 1000}
    res_json_field = input["res_json_field"] if "res_json_field" in input else "result"
    list_page_per_doc = input["list_page_per_doc"] if "list_page_per_doc" in input else param["list_page_per_doc"] if "list_page_per_doc" in param else 0  # VR 17-5 TODO reading it from param is, I think, just there for debugging

    parallel = bool(param_json["parallel"]) if "parallel" in param_json else False
    nb_thread = param_json["nb_thread"] if "nb_thread" in param_json else 10

    if list_page_per_doc == 0:
        print("CHECK ERROR using default list of pages not grouped by document, maybe normal if it is the first map_reduce where we try to classify : datou_int_id : " + str(datou_int_id))
        if len(list_page_content_text) != len(list_page_content):
            print("ERROR migration datou_exec_partial_data_json")
        list_page_per_doc = ";".join(list(map(str, range(1, len(list_page_content_text) + 1))))

    from lib.lib_util import split_text, split_text_by_doc, split_list_page_by_doc, split_list_page_by_page
    aux_input_var = "text"
    curr_datou_id = None
    if strat_reduce == "concat":
        print("concat")

        # In fact this is the complete text (do we split on BEGIN END ? GRRR)
        # This case is for old versions generated before 30/6/25 when we want to do a partial exec for a stat study !
        if list_page_content == []:
            from lib.lib_util import managing_deprecated_input_text_concat_into_list
            list_texts = managing_deprecated_input_text_concat_into_list(text, list_page_per_doc)

        else:
            if text != None and (len(list_page_per_doc) == len(text) or (len(list_page_per_doc) == 1 and type(text) == str)):
                print("NEVER CORRECT OR USELESS ANYWAY We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
                # if type(text) == str:
                #     list_texts = [text]

            try:
                list_texts = split_text_by_doc(list_page_content_text, list_page_per_doc)
            except Exception as e:
                print("l 1038 If only one doc it could run !, and maybe also with multiple docs by the way")
                print(str(e))
    elif strat_reduce == "append_page":
        list_texts = split_list_page_by_page(paragraphs)
        aux_input_var = "paragraphs"
    elif strat_reduce == "append_doc":
        if text != None and (len(list_page_per_doc) == len(text) or (len(list_page_per_doc) == 1 and type(text) == str)):
            print("We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
            if type(text) == str:
                list_texts = [text]
        try:
            list_texts = split_list_page_by_doc(list_page_content_text, list_page_per_doc)
        except Exception as e:
            print("l 1051 If only one doc it could run !, and maybe also with multiple docs by the way")
            print(str(e))
        aux_input_var = "list_page_content"
    elif strat_reduce == "concat_stride":  # TODO add a test for these arguments
        list_texts = split_text(text, param["size"], param["overlap"])
    elif strat_reduce == "run_one_datou":
        # default behavior: set the internal datou id for run_one_datou
        if "datou_exec_info" in input and "mtr_datou_id" in input["datou_exec_info"]:
            curr_datou_id = input["datou_exec_info"]["mtr_datou_id"]
            input["datou_exec_info"]["mtr_datou_id"] = datou_int_id
#        VR42 : to add, probably only if with_audit or if we have datou_exec_info => and we would need it all the time in my opinion !
        if "datou_exec_info" in input:
            input["datou_exec_info"]["mtr_datou_id"] = datou_int_id
        list_texts = [input]
    else:
        print("Unsupported : strat_reduce : " + str(strat_reduce))

    # user HACK right on datou
    from server.safia import lpgss_singleton, lib_right_singleton

    # TODO VR : is this really what we want? shouldn't we rather get the lss from the execution context (we would then have the user_id)
    from lib.lib_safia_system import LibSafiaSystem
    lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
    lss.user_id = 1  # THIS is a hack
    project_id = input["project_id"] if "project_id" in input else None
    datous = lss.get_datou(datou_int_id, project_id=project_id)

    datou = None
    for d in datous:
        if d["id"] == datou_int_id:
            datou = d
            break

    if datou == None:
        print("Datou unavailable " + str(datou_int_id))
        return {"text" : "Datou unavailable " + str(datou_int_id)}

    import pandas as pd
    if "df" in input and isinstance(input["df"], pd.DataFrame):  # was `type(input["df"]) == type(pd.DataFrame)`, which is always False
        df = input["df"] if "df" in input else None
#        print("len list_texts : " + str(len(list_texts)))
        print("len(df.values) : " + str(len(df.values)))

    reduced_result = None
    if strat_reduce == "concat":
        reduced_result = ""
    else:
        print("NOT needed (message isn't correct 16-1-25 ) Unsupported : strat_reduce : " + str(strat_reduce))
        reduced_result = None

    list_audit_map_reduce = []
    id_intern_map = 0

    hit_main = input["hash_id_treatment"] if "hash_id_treatment" in input else "hit_unknown"

    privacy = True
    list_param_json_steps = list(map(lambda x: x["param_json"], datou["steps"]))
    list_steps = list(map(lambda x: x["name"], datou["steps"]))
    if parallel:
        print("PALAFI")
        from lib.datou.lib_parallel import datou_parallel_map_reduce
        reduced_result, list_audit_map_reduce = datou_parallel_map_reduce(list_texts, res_json_field, aux_input_var,
                                                                          list_steps, list_param_json_steps, param_json,
                                                                          verbose, privacy, with_audit, strat_reduce,
                                                                          nb_thread, hit_main, id_step_incomplete_args)
        print("PALAVI")
    else:
        # input:
        #   list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps, param_json, verbose, privacy, with_audit, strat_reduce, list_audit_map_reduce
        # output:
        #   reduced_result, list_audit_map_reduce ??
        reduced_result = aux_map_reduce_loop(list_texts, res_json_field, aux_input_var, list_steps, list_param_json_steps,
                                             param_json,
                                             verbose, privacy, with_audit, strat_reduce, reduced_result, list_audit_map_reduce, hit_main, id_step_incomplete_args)

    document_safia_id = "map_reduce_" + (input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4()))

    data = [
        {
            "id": document_safia_id,
            "text": reduced_result
        }
    ]

    if curr_datou_id != None:
        input["datou_exec_info"]["mtr_datou_id"] = curr_datou_id
    output = {"text" : reduced_result, "json_to_save" : data, "list_audit_map_reduce" : list_audit_map_reduce}
    if output["text"] == None:
        del output["text"]
    return output
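
# Illustrative call (not part of the original module); assumes a datou with
# internal id 42 exists and that the safia server context is up, so this is a
# shape reference rather than something to run as-is.
def _example_map_reduce():
    out = datou_safia_step_map_reduce({"text": "page one text", "datou_int_id": 42},
                                      param_json={"strat_reduce": "concat",
                                                  "param": {"size": 10000, "overlap": 1000}},
                                      ce=_StubCE())
    print(out.get("text", "")[:200])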
1181def datou_safia_step_load_tab(input : dict = {},
1182 param_json : dict = {},
1183 ce : CE = None,
1184 verbose : bool = False,
1185 layer_api : LayerGeneric = None) -> dict :
1186 list_input = ["text"]
1187 list_output = ["df"] # and others custom
1188 list_param_json = ["col_to_input"]
1190 col_to_input = param_json["col_to_input"] if "col_to_input" in param_json else []
1191 output_df = param_json["output_df"] if "output_df" in param_json else "df"
1192 merge = param_json["merge"] if "merge" in param_json else None
1193 multi_input = input["multi_input"] if "multi_input" in input else False
1194 aggregate_multi_input = input["aggregate_multi_input"] if "aggregate_multi_input" in input else False
1195 saxia_split_end_csv = input["saxia_split_end_csv"].replace(" ", "") if "saxia_split_end_csv" in input else ""
1197 max_nb_try = param_json["max_nb_try"] if "max_nb_try" in param_json else 3
1198 trigger = param_json["trigger"] if "trigger" in param_json else "nb_pages_80"
1199 retry_step_id = param_json["retry_step_id"] if "retry_step_id" in param_json else 2
1200 to_be_used = param_json["to_be_used"] if "to_be_used" in param_json else "split_at_10"
1202 # markdown, json, auto_detect ??
1203 format_input = param_json["format_input"] if "format_input" in param_json else "markdown"
1205 {"nb_try": 3, "trigger": "nb_pages_80", "retry_step_id": 2, "to_be_used": "split_at_10"}
1207 saxia_all_doc_separated = input["saxia_all_doc_separated"] if "saxia_all_doc_separated" in input else False
1209 nb_page = input["nb_page"] if "nb_page" in input else 0
1211 if saxia_all_doc_separated:
1212 print("Need to use only saxia_split_end_csv : " + str(saxia_split_end_csv))
1213 print("We should force format_input to json")
1214 input["result"] = []
1215 format_input = "json_as_dict"
1216 id_page_id = 0
1217 if saxia_split_end_csv != "":
1218 saxia_split_end_csv += ","
1219 saxia_split_end_csv += str(nb_page)
1220 for list_of_page_read in saxia_split_end_csv.split(","):
1221 list_of_page_read_int = int(list_of_page_read)
1222 list_of_page_list = list(range(id_page_id + 1, list_of_page_read_int + 1))
1223 id_page_id = list_of_page_read_int
1224 list_of_page_csv = ",".join(list(map(str, list_of_page_list)))
1225 one_list_of_page = {"Liste des pages": [list_of_page_csv]}
1226 # Titre
1227 # Nombre de pages
1228 # Commentaires
1229 # document_type
1230 input["result"].append(one_list_of_page)
1232 if id_page_id != nb_page:
1233 print("ERROR : saxia_split_end_csv : " + str(saxia_split_end_csv) + " and nb_page : " + str(nb_page))
1235 # find function used in datou_batch to maje df from text
1236 from lib.batch.lib_batch import create_pandas_table_from_text
1237 from lib.lib_util import parse_json_from_prompt_result
1239 result = input["result"]# if "result" in input else ""
1240 # Il faut vérifier que result = text fasse planter les tests
1241 if type(result) == list:
1242 #result = result[0]
1243 print(" ERROR treated as WARNING : only the first file will be treated : " + str(result))
1244 print("TODO multiple files not implemented yet !")
1245 if multi_input:
1246 if aggregate_multi_input:
1247 # result = " ".join(result)
1248 print("We will need to do something !")
1249 else:
1250 print("internal ERROR : multiple files not implemented yet !")
1252 import pandas as pd
1253 # todo vr 27-12-23 normaliser les input et utilisation des assoc
1254 if multi_input or saxia_split_end_csv != "":
1255 print(" For now aggregate_multi_input is set to True by default in case of multi input")
1256 complete_df = None
1257 for res in result :
1258 if format_input == "markdown":
1259 df = create_pandas_table_from_text(res, verbose = verbose) if "result" in input else pd.DataFrame(columns=["Init"])
1260 elif format_input == "json":
1261 df = pd.DataFrame(parse_json_from_prompt_result(res))
1262 elif format_input == "json_as_dict":
1263 df = pd.DataFrame(res)
1264 else:
1265 print("format_input " + format_input + " not implemented yet ! ")
1266 if type(complete_df) == types.NoneType:
1267 complete_df = df
1268 else:
1269 complete_df = pd.concat([complete_df, df], axis=0, ignore_index=True)
1270 df = complete_df
1271 else:
1272 if format_input == "markdown":
1273 df = create_pandas_table_from_text(result, verbose = verbose) if "result" in input else pd.DataFrame(columns=["Init"])
1274 elif format_input == "json":
1275 df = pd.DataFrame(parse_json_from_prompt_result(result))
1276 else:
1277 print("format_input " + format_input + " not implemented yet ! ")
1279 output = {output_df : df}
1280 for col in col_to_input:
1281 strat = col["strat"] if "strat" in col else "concat_sccsv"
1282 col_name = col["col_name"] if "col_name" in col else None
1283 input_name = col["input_name"] if "input_name" in col else None
1284 all_data_to_clean = list(df[col_name]) if type(df) == pd.DataFrame and col_name in df.columns else []
1285 # TODO better car mieux vaut quick and dirty que de ne pas avancer, vraisemblablement VR 20-12-23
1286 try:
1287 all_data = [d.replace(" ", "") for d in all_data_to_clean] # .replace("-", ",") = VR 14-2-24 a present gérer par le parsing begin_end manuelle avec 3 etats
1288 except :
1289 all_data = all_data_to_clean
1290 io_data_datou = ";".join(all_data)
1291 if input_name :
1292 output[input_name] = io_data_datou
1294 if merge is not None or trigger == "nb_pages_80":
1295 df_by_page = df
1296 df_by_document = df # input["df"] if "df" in input else None
1297 # Loop over the documents and the pages to check that the result is correct
1298 # TODO VR 27-12-23: add a test to verify this is correct
1299 print(" In datou_safia_step_load_tab ")
1300 try:
1301 res1 = df_by_document.to_markdown()
1302 res2 = df_by_page.to_markdown()
1303 res3 = df_by_document.to_json()
1304 res4 = df_by_page.to_json()
1305 if verbose :
1306 print(res1)
1307 print(res2)
1308 print(res3)
1309 print(res4)
1310 except Exception as e:
1311 print(str(e))
1312 try:
1313 # The list of pages in df_by_document is in the column "Liste des pages"
1314 # Whether a page is the beginning or the end of a document is stored in df_by_page, in the column Information_debut_fin
1315 # For each row of df_by_document, fetch the matching pages in df_by_page and check that there is exactly one begin page and one end page per document, otherwise report an error message
1317 list_all_page = []
1318 for index, row in df_by_document.iterrows():
1319 liste_des_pages = row["Liste des pages"]
1320 from lib.lib_util import parse_list_page_as_begin_end_separated
1321 try:
1322 list_page_one_document = list(map(int, liste_des_pages.split(",")))
1323 except Exception as e:
1324 print("Trying to parse list_page_per_doc as begin and end separated : " + str(e))
1325 list_page_one_document = parse_list_page_as_begin_end_separated(liste_des_pages)
1326 print("list_page_one_document : " + str(list_page_one_document))
1327 if len(list_page_one_document) == 0:
1328 print("ERROR")
1330 list_all_page.extend(list_page_one_document)
1331 if False:
1332 list_info_debut_fin = []
1333 for page_nb in list_page_one_document:
1334 df_by_page_one_page = df_by_page[df_by_page["Numéro de La Page"] == page_nb]
1335 Information_debut_fin = df_by_page_one_page["Information_debut_fin"].values
1336 list_info_debut_fin.append(Information_debut_fin)
1337 # Now count the duplicates
1338 count_begin = sum(1 for x in list_info_debut_fin if "debut" in str(x).lower())
1339 count_end = sum(1 for x in list_info_debut_fin if "fin" in str(x).lower())
1340 if count_begin > 1 or count_end > 1:
1341 print("ERROR OR WARNING multiple debut fin : " + str(list_info_debut_fin))
1343 # Check that every page of df_by_page is in df_by_document; otherwise add each missing page separately as its own document
1344 if list_all_page != list(set(list_all_page)):
1345 print("ERROR OR WARNING Multiple page : " + str(list_all_page))
1347 if set(range(1, nb_page + 1)) != set(list_all_page): # pages are numbered 1..nb_page
1348 print("ALL page : " + str(list_all_page))
1349 list_missing_page = list(set(range(1, nb_page + 1)) - set(list_all_page))
1350 for missing_page in list_missing_page:
1351 import pandas as pd
1352 df_by_document = pd.concat([df_by_document, pd.DataFrame({"Liste des pages" : str(missing_page), "document_type" : "Added for completion"}, index=[len(df_by_document)])], ignore_index=True)
1354 if trigger == "nb_pages_80":
1355 if len(list_missing_page) > 0.2 * float(nb_page):
1356 output["retry"] = True
1357 output["retry_step_id"] = retry_step_id
1358 output["max_nb_try"] = max_nb_try
1359 print("TRIGGER RETRY IN LOAD TAB")
1360 if to_be_used == "split_at_10":
1361 print("Ici on pourrait reconstuirer les texts de input selon un split pris en entrée ou bien un split par défaut par exemple toutes les 10 ou 15 pages")
1362 print("OUI ! Ici on pourrait reconstuirer les texts de input selon un split pris en entrée ou bien un split par défaut par exemple toutes les 10 ou 15 pages")
1364 except Exception as e:
1365 if trigger == "nb_pages_80":
1366 output["retry"] = True
1367 output["retry_step_id"] = retry_step_id
1368 output["max_nb_try"] = max_nb_try
1369 print("ERROR IN TRIGGER RETRY IN LOAD TAB")
1370 print(str(e))
1372 # Check that the page order inside a sub-document is unchanged; otherwise report a message
1374# if trigger == "nb_pages_80":
1376# print("ERROR OR WARNING : nb_pages_80 : " + str(len(df)))
1378 if saxia_all_doc_separated:
1379 print("TO TEST !")
1380 if "result" in input:
1381 print("WARNING saxia_all_doc_separated Not expected ! ")
1382 else :
1383 input["result"] = ""
1384 if "text_only_for_meta_data_and_not_split" in input:
1385 input["text"] = input["text_only_for_meta_data_and_not_split"]
1386 else:
1387 print("All split csv should be tested more carefully, missing key text_only_for_meta_data_and_not_split")
1388 del input["text_only_for_meta_data_and_not_split"]
1390 if output[output_df] is not None:
1391 try:
1392 output[output_df]["prediag"] = "MISSING"
1393 if "prediag_csv" in input and input["prediag_csv"] != "" and input["prediag_csv"] != None:
1394 print("We can add prediag in df but we need list_of_list_of_file")
1395 # this does not work: list_page_per_doc
1396# list_page_per_doc = input["list_page_per_doc"] if "list_page_per_doc" in input else ";".join(list(map(str, list(range(1, nb_page + 1)))))
1397# list_of_list_of_page_per_doc = list(map(lambda x: list(map(int, x.split(","))), list_page_per_doc.split(";")))
1399 prediag_csv = input["prediag_csv"]
1400 list_prediag = prediag_csv.split(",")
1401 if len(list_prediag) == nb_page:
1402 # iterate over the df and, for each row, look up the corresponding page in list_page_per_doc
1403 for index, row in output[output_df].iterrows():
1404 liste_des_pages = row["Liste des pages"]
1405 list_page_one_document = []
1406 if liste_des_pages != '':
1407 list_page_one_document = list(map(int, liste_des_pages.split(",")))
1408 # Look up the corresponding page in list_page_per_doc
1409 sub_prediag_csv = ""
1410 for page_nb in list_page_one_document:
1411 if page_nb - 1 < len(list_prediag):
1412 if sub_prediag_csv != "":
1413 sub_prediag_csv += ","
1414 sub_prediag_csv += list_prediag[page_nb - 1]
1415 else:
1416 print("PROBLEMA CHECK !")
1417 output[output_df].loc[index, "prediag"] = sub_prediag_csv
1418 else:
1419 print("PROBLEMB CHECK : " + str(nb_page) + " len(list_prediag) : " + str(len(list_prediag)))
1420 else :
1421 print("MISSING PREDIAG PROBLEMC CHECK !, TO DO PLEASE")
1422 except Exception as e:
1423 print(str(e))
1424 else:
1425 print("ERROR CHECK : df is None")
1427 print("END OF LOAD TAB")
1428 return output
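# Editorial sketch, not part of the original source: the prediag redistribution above,
# standalone. Given one prediag entry per page and a document's "Liste des pages" csv,
# the per-document prediag csv picks the entries at page_nb - 1. Helper name hypothetical.
def _demo_prediag_for_doc(list_prediag, page_list_csv): # illustration only
    pages = [int(p) for p in page_list_csv.split(",") if p != ""]
    return ",".join(list_prediag[p - 1] for p in pages if p - 1 < len(list_prediag))
# _demo_prediag_for_doc(["a", "b", "c"], "1,3") == "a,c"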
1432# Keep for easy implementation of new function (remove pragma no cover and complete all)
1433# INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('anon', 'datou_safia_step_anon', '["openai_token"]', '["list_page_content"]', '["result"]');
1434def datou_safia_step_anon(input : dict = {},
1435 param_json : dict = {},
1436 ce : CE = None,
1437 verbose : bool = False,
1438 layer_api : LayerGeneric = None) -> dict :
1439 list_input = []
1440 list_output = []
1441 list_param_json = []
1443 keyword = param_json["keyword"] if "keyword" in param_json else {}
1444 exclude_word_split = param_json["exclude_word_split"] if "exclude_word_split" in param_json else []
1445 word_to_keep = param_json["word_to_keep"] if "word_to_keep" in param_json else []
1446 exclude_pers = param_json["exclude_pers"] if "exclude_pers" in param_json else ["docteur"]
1447 exclude_bib_start = param_json["exclude_bib_start"] if "exclude_bib_start" in param_json else ["docteur ", "dr. ", "dr "]
1448 append_detected_to_output = bool(param_json["append_detected_to_output"]) if "append_detected_to_output" in param_json else False
1449 anon_all_unread = bool(param_json["anon_all_unread"]) if "anon_all_unread" in param_json else False
1450 # list of keys for which we remove the info in order to make anonymisation
1451 remove_search_string_for_key_private_data = param_json["remove_search_string_for_key_private_data"] if "remove_search_string_for_key_private_data" in param_json else []
1453 result_info_to_anon = input["result"] if "result" in input else ""
1454 list_page_content = input["list_page_content"] if "list_page_content" in input else []
1456 all_paragraphs = param_json["data"]["par"] if "data" in param_json and "par" in param_json["data"] else []
1457 paragraphs_to_anon = []
1458 for one_page in all_paragraphs:
1459 one_page_to_anon = []
1460 for p in one_page:
1461 if "class" in p and p["class"] != "content":
1462 p["xmin"] = p["x"]
1463 p["ymin"] = p["y"]
1464 p["xmax"] = p["x"] + p["w"]
1465 p["ymax"] = p["y"] + p["h"]
1466 p["old_text"] = p["text"]
1467 p["text"] = "KKK"
1468 one_page_to_anon.append(p)
1469 paragraphs_to_anon.append(one_page_to_anon)
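# Editorial sketch, not part of the original source: the box derivation above turns a
# top-left corner plus size (x, y, w, h) into corner coordinates, and masks the text of
# every non-"content" paragraph with the "KKK" placeholder.
if False: # illustration only
    demo_p = {"x": 10, "y": 20, "w": 100, "h": 30, "text": "Dr X", "class": "header"}
    demo_p.update(xmin=demo_p["x"], ymin=demo_p["y"], xmax=demo_p["x"] + demo_p["w"],
                  ymax=demo_p["y"] + demo_p["h"], old_text=demo_p["text"], text="KKK")
    assert (demo_p["xmax"], demo_p["ymax"]) == (110, 50)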
1471 hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4())
1472 out_folder = input["out_folder"] if "out_folder" in input else None # "static/temp/anon"
1473# out_folder = None
1475 from lib.anon.lib_anon import anon_document
1477 from lib.lib_util import create_prefix_file_name_from_json_prefix
1478 prefix_file = create_prefix_file_name_from_json_prefix(input["prefix_file"]) if "prefix_file" in input else ""
1479 paragraphs_to_anon_copy = paragraphs_to_anon.copy()
1480 list_pngs, pdf_anon, json_info_to_anon = anon_document(result_info_to_anon, list_page_content, verbose=False,
1481 keyword = keyword, hash_id_treatment=hash_id_treatment,
1482 prefix_file = prefix_file,
1483 word_to_keep = word_to_keep,
1484 exclude_word_split=exclude_word_split,
1485 anon_all_unread=anon_all_unread,
1486 remove_search_string_for_key_private_data=remove_search_string_for_key_private_data,
1487 exclude_pers = exclude_pers,
1488 exclude_bib_start = exclude_bib_start,
1489 out_folder=out_folder,
1490 paragraphs_to_anon=paragraphs_to_anon_copy)
1492 if verbose:
1493 print(" pdf_anon : " + str(pdf_anon))
1495 output = {"pdf_anon" : pdf_anon}
1497 if append_detected_to_output:
1498 for key in json_info_to_anon:
1499 if type(json_info_to_anon[key]) == list:
1500 json_info_to_anon[key] = ";".join(json_info_to_anon[key])
1501 output.update(json_info_to_anon)
1503 return output
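# Editorial sketch, not part of the original source: the append_detected_to_output
# flattening above, standalone: list values become ";"-joined strings so the whole dict
# can be merged into the flat step output. Helper name hypothetical.
def _demo_flatten_detected(info): # illustration only
    return {k: ";".join(v) if isinstance(v, list) else v for k, v in info.items()}
# _demo_flatten_detected({"pers": ["A", "B"], "n": 1}) == {"pers": "A;B", "n": 1}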
1508# INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('format', 'datou_safia_step_format', '["format_info"]', '["result"]', '["result"]');
1509def datou_safia_step_format(input : dict = {},
1510 param_json : dict = {},
1511 ce : CE = None,
1512 verbose : bool = False,
1513 layer_api : LayerGeneric = None) -> dict :
1514 list_input = []
1515 list_output = []
1516 list_param_json = []
1517 config_project = param_json["config_project"] if "config_project" in param_json else {}
1518 format = config_project["saxia"]["format"] if "saxia" in config_project and "format" in config_project["saxia"] else {}
1519 format_json_from_conf = format["info_format_intro"] if "info_format_intro" in format else {}
1520 default_format_intro_hc = "Le {datet}, {document_type} par le Docteur {medecin_nom}, {medecin_specialite} :"
1521 format_info = format_json_from_conf["format"] if "format" in format_json_from_conf else {}
1522 info_format_intro = format_json_from_conf["format"]["intro"] if "format" in format_json_from_conf and "intro" in format_json_from_conf["format"] else {"default": default_format_intro_hc}
1524 exec_if_true = param_json["exec_if_true"] if "exec_if_true" in param_json else True
1525 if not exec_if_true: # an empty dict {} is falsy, so the former explicit check was redundant
1526 print(" exec_if_true is falsy, we skip the formatting step ")
1527 return input
1528 print("Passed the exec_if_true check, running the formatting step ")
1530 append_resume = param_json["append_resume"] if "append_resume" in param_json else False
1531 content_resume = input["content_resume"] if "content_resume" in input else ""
1533 try :
1534 print(" keys input : " + str(input.keys()))
1535 length_input = {k : len(input[k]) if (type(input[k]) != bool and input[k] is not None) else 0 for k in input}
1536 print(" length_input : " + str(length_input))
1537 except Exception as e:
1538 print("ERROR Problem with input : " + str(e) + " treated as WARNING ")
1540# "compte_rendu_complet_medecin": "from_json_copy"
1541 compte_rendu_complet_medecin = param_json["compte_rendu_complet_medecin"] if "compte_rendu_complet_medecin" in param_json else ""
1543 list_class_copy = param_json["list_class_copy"] if "list_class_copy" in param_json else []
1544 append_table_doc = (param_json["append_table_doc"] == 1 or str(param_json["append_table_doc"]).lower() == "true") if "append_table_doc" in param_json else False
1545 append_table_page = (param_json["append_table_page"] == 1 or str(param_json["append_table_page"]).lower() == "true") if "append_table_page" in param_json else False
1546 with_hyperlink = (param_json["with_hyperlink"] == 1 or str(param_json["with_hyperlink"]).lower() == "true") if "with_hyperlink" in param_json else False
1547 append_parsing_meta_info_to_table = (param_json["append_parsing_meta_info_to_table"] == 1 or str(param_json["append_parsing_meta_info_to_table"]).lower() == "true") if "append_parsing_meta_info_to_table" in param_json else False
1548 reproduce_format_new_page = (param_json["reproduce_format_new_page"] == 1 or str(param_json["reproduce_format_new_page"]).lower() == "true") if "reproduce_format_new_page" in param_json else False
1549 reorder_paragraph_by_order_lex_token = (param_json["reorder_paragraph_by_order_lex_token"] == 1 or str(param_json["reorder_paragraph_by_order_lex_token"]).lower() == "true") if "reorder_paragraph_by_order_lex_token" in param_json else False
1550 smart_new_line_from_token_pos = (param_json["smart_new_line_from_token_pos"] == 1 or str(param_json["smart_new_line_from_token_pos"]).lower() == "true") if "smart_new_line_from_token_pos" in param_json else False
1551 order_by_date = (param_json["order_by_date"] == 1 or str(param_json["order_by_date"]).lower() == "true") if "order_by_date" in param_json else False
1552 order_by_document_type = (param_json["order_by_document_type"] == 1 or str(param_json["order_by_document_type"]).lower() == "true") if "order_by_document_type" in param_json else False
1553 result_input = input["result"] if "result" in input else ""
1554 df = input["df"] if "df" in input else "None"
1555 df_by_page = input["df_by_page"] if "df_by_page" in input else "None"
1556 input_col_intro = input["input_col_intro"] if "input_col_intro" in input else "intro_correct_typo"
1557 input_col_cr = input["input_col_cr"] if "input_col_cr" in input else "cr_correct_typo"
1558 load_df_from_db_and_correct = (str(input["load_df_from_db_and_correct"]) == "1" or str(input["load_df_from_db_and_correct"]).lower() == "true") if "load_df_from_db_and_correct" in input else False
1559 out_file = input["out_file"] if "out_file" in input else ""
1560 nb_blank_line = param_json["nb_blank_line"] if "nb_blank_line" in param_json else 0
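# Editorial sketch, not part of the original source: the repeated coercion pattern above,
# factored as a helper. It accepts the int 1 or any case of "true"; a missing key is False.
# The helper name is hypothetical.
def _param_flag(param_json, key): # illustration only
    return (param_json[key] == 1 or str(param_json[key]).lower() == "true") if key in param_json else False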
1562 hash_id_treatment = input["hash_id_treatment"] if "hash_id_treatment" in input else str(uuid4())
1564 from lib.lib_util import parse_json_from_prompt_result, format_one_res, complete_date_and_order_json_to_mettre_en_forme, append_id_by_order
1565 list_json_to_mettre_en_forme = parse_json_from_prompt_result(result_input)
1566 list_json_to_mettre_en_forme = append_id_by_order(list_json_to_mettre_en_forme)
1568 import pandas as pd
1569 nb_doc = len(df) if type(df) == pd.DataFrame else 0
1570 nb_page_from_df = len(df_by_page) if type(df_by_page) == pd.DataFrame else 0
1572# if order_by_date :
1573# list_json_to_mettre_en_forme = complete_date_and_order_json_to_mettre_en_forme(list_json_to_mettre_en_forme)
1575 from lib.lib_util import create_prefix_file_name_from_json_prefix
1576 prefix_file = create_prefix_file_name_from_json_prefix(input["prefix_file"]) if "prefix_file" in input else ""
1578 df_complet_as_markdown = ""
1579 df_complet_as_json = ''
1580 nb_modif_manual = -1
1581 nb_modif_class_manual = -1
1582 nb_manual_action_df = -1
1583 nb_manual_action_df_for_col_audit = -1
1584 total_text = ""
1585 result_output = "" # TODO duplicate
1586 if verbose :
1587 print(" load_df_from_db_and_correct : " + str(load_df_from_db_and_correct))
1588 else :
1589 print(" load_df_from_db_and_correct : " + str(load_df_from_db_and_correct)[:100])
1590 out_folder = input["out_folder"] if "out_folder" in input else "temp"
1592 from server.safia import lpgss_singleton
1593 project_id = input["project_id"] if "project_id" in input else param_json["project_id"] if "project_id" in param_json else 70
1594 conf_project = lpgss_singleton.load_conf_project(project_id)
1595 format_json_from_conf = conf_project["saxia"]["format"]["info_format_intro"] if "saxia" in conf_project and "format" in conf_project["saxia"] and "info_format_intro" in conf_project["saxia"]["format"] else {}
1596 info_format_intro_bis = format_json_from_conf["format"]["intro"] if "format" in format_json_from_conf and "intro" in format_json_from_conf["format"] else {"default": default_format_intro_hc}
1598 print(" info_format_intro and info_format_intro_bis should be the same !")
1600 outfile_name_docx = ""
1601 audit_info_count = {}
1602 audit_info_write = {}
1603 # consolidate
1604 if load_df_from_db_and_correct:
1605 from lib.lib_safia_system import LibSafiaSystem
1606 user_id = 0
1607 from server.safia import lib_right_singleton
1608 lss = LibSafiaSystem(lib_user_data_internal=lpgss_singleton, lib_right=lib_right_singleton)
1609 lss.user_id = user_id # This is a hack
1610 hash_id_treatment = input["hash_id_treatment_input"] if "hash_id_treatment_input" in input else "default_value_hash_id_treatment"
1612 from lib.manaudit.lib_datou_audit import load_audit_info_and_apply_manual_correction, list_action_by_user, count_time_lab_by_user
1613 try :
1615 df_auto, df_cons, hash_id_treatment_rerun, results, audit_info_count, id_file, audit_json_file_content_as_json, all_results, info_date = load_audit_info_and_apply_manual_correction(hash_id_treatment_auto = hash_id_treatment,
1616 hash_id_treatment_manual = hash_id_treatment,
1617 lpgss = lss.lib_user_data_internal,
1618 project_id = project_id)
1620 map_user_id_list_page, map_user_id_time_modif, map_user_id_list_pages_for_split = list_action_by_user(all_results, df_auto)
1621 map_interval_sum_by_user = count_time_lab_by_user(all_results)
1622 nb_correction_split = sum(list(map(lambda x: len(map_user_id_list_pages_for_split[x]), map_user_id_list_pages_for_split)))
1623 if nb_correction_split == 0:
1624 split_auto_perfect = True
1625 else:
1626 split_auto_perfect = False
1627 if len(map_user_id_list_page) == 2:
1628 print("We expect a labeliser and a corrector ")
1629 user_id_0 = list(map_user_id_list_page.keys())[0]
1630 user_id_1 = list(map_user_id_list_page.keys())[1]
1631 nb_correction_0 = len(map_user_id_list_page[user_id_0])
1632 nb_correction_1 = len(map_user_id_list_page[user_id_1])
1633 if nb_correction_0 < nb_correction_1:
1634 user_id_labeliser = user_id_1
1635 user_id_corrector = user_id_0
1636 elif nb_correction_0 == nb_correction_1:
1637 print("WARNING EQUAL NUMBER OF CORRECTION BETWEEN LABELISER AND CORRECTOR, WE TAKE THE FIRST AS LABELISER")
1638 user_id_labeliser = user_id_0
1639 user_id_corrector = user_id_1
1640 else :
1641 user_id_labeliser = user_id_0
1642 user_id_corrector = user_id_1
1644 nb_page_no_correction = len(map_user_id_list_page[user_id_labeliser]) - len(map_user_id_list_page[user_id_corrector])
1645 max_nb_page = max(map_user_id_list_page[user_id_labeliser])
1646 time_minute_labelizer = map_interval_sum_by_user[user_id_labeliser]["total_minutes"] if user_id_labeliser in map_interval_sum_by_user and "total_minutes" in map_interval_sum_by_user[user_id_labeliser] else 0.0
1647 time_minute_corrector = map_interval_sum_by_user[user_id_corrector]["total_minutes"] if user_id_corrector in map_interval_sum_by_user and "total_minutes" in map_interval_sum_by_user[user_id_corrector] else 0.0
1648 nb_interval_labelizer = len(map_interval_sum_by_user[user_id_labeliser]["intervals"]) if user_id_labeliser in map_interval_sum_by_user and "intervals" in map_interval_sum_by_user[user_id_labeliser] else 0
1649 nb_interval_corrector = len(map_interval_sum_by_user[user_id_corrector]["intervals"]) if user_id_corrector in map_interval_sum_by_user and "intervals" in map_interval_sum_by_user[user_id_corrector] else 0
1650 info_correction = " nb_page_no_correction : " + str(nb_page_no_correction) + " pourcentage perfect : " + str(int(100 * float(nb_page_no_correction / float(max_nb_page)))) + \
1651 " user_id_labeliser : " + str(user_id_labeliser) + " with " + str(len(map_user_id_list_page[user_id_labeliser])) + f" corrections in {time_minute_labelizer:.2f} minutes in {nb_interval_labelizer} intervals " + \
1652 " user_id_corrector : " + \
1653 str(user_id_corrector) + " with " + str(len(map_user_id_list_page[user_id_corrector])) + f" corrections in {time_minute_corrector:.2f} minutes in {nb_interval_corrector} intervals " + \
1654 ", nb_page_no_correction : " + str(nb_page_no_correction)
1655 nb_page_perfect = nb_page_no_correction
1656 else :
1657 time_minute_labelizer = -1
1658 time_minute_corrector = -1
1659 nb_interval_labelizer = -1
1660 nb_interval_corrector = -1
1661 info_correction = str(len(map_user_id_list_page)) + " users found in correction, we cannot separate labeliser and corrector : " + str(map_user_id_list_page)
1663 if time_minute_labelizer == 0.0 or time_minute_corrector == 0.0:
1664 info_correction += " WARNING time_minute_labelizer : " + str(time_minute_labelizer) + " or time_minute_corrector : " + str(time_minute_corrector) + " is zero "
1665 info_correction += " map_interval_sum_by_user : " + str(map_interval_sum_by_user)
1667 info_correction += " split_auto_perfect : " + str(split_auto_perfect) + " nb_correction_split : " + str(nb_correction_split)
1669 #if type(df_auto_as_json) != types.NoneType:
1670 # df_auto = pd.DataFrame(df_auto_as_json, convert_dates=["datet", "date_entree_hospitalisationt", "date_sortie_hospitalisationt", "date_fin_arret_travailt", "date_debut_arret_travailt"])
1671 nb_modif_manual = sum(list(map(lambda x: len(x["manual_input_info"]["list_actions"]), all_results)))
1672 try:
1673 # nb_modif_class_manual = sum(list(map(lambda x: np.count_nonzero([0 if a["type_action"] != "class_paragraph" else 1 for a in x["manual_input_info"]["list_actions"]]), all_results)))
1674 nb_modif_class_manual = audit_info_count["nb_modif_class_manual"] if "nb_modif_class_manual" in audit_info_count else -2
1675 nb_manual_action_df = audit_info_count["nb_manual_action_df"] if "nb_manual_action_df" in audit_info_count else -2
1676 nb_manual_action_df_for_col_audit = audit_info_count["nb_manual_action_df_for_col_audit"] if "nb_manual_action_df_for_col_audit" in audit_info_count else -2
1677 print(" Faisons mieux au dessus ")
1678 except Exception as e:
1679 print("Error while counting class_paragraph : " + str(e))
1681 df = results["df"]
1683# id_file = all_result["id_file"] if "id_file" in all_result else ""
1685 nb_doc = len(df)
1686 try:
1687 nb_page = len(audit_json_file_content_as_json["io_exec"]["3"]["input"]["paragraphs"])
1688 except Exception as e:
1689 print(str(e))
1690 nb_page = -1
1692 print("TODO warning 14-5-24 ce code doit etre dedupliquer vu que c'est le meme dans les deux parties de la condition consolidate et l'autre")
1693 if order_by_date:
1694 from lib.lib_util import order_df_by_date
1695 df = order_df_by_date(df)
1697 if order_by_document_type:
1698 from lib.lib_util import order_by_document_type
1699 df = order_by_document_type(df)
1701 # Argument to modularize : df, input_col_intro, input_col_cr, out_file, hash_id_treatment
1702 # Output : total_text
1703 # why not output nb_file, nb_page, nb_modif_manual,
1704 from lib.lib_util import write_table_list_inner_document_0424_bis
1705 out_file = id_file + "_h_" + out_file
1706 total_text, outfile_name_docx, audit_info_write = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file, hash_id_treatment, out_folder, format_info, verbose=verbose,
1707 content_resume=content_resume, append_resume=append_resume)
1709 except Exception as e:
1710 print("Error while loading and correcting df : " + str(e))
1712 else: # format from initial run (not consolidate)
1713 df_auto = None
1714 print(" df len 1058 : " + str(len(df)))
1715 document = None
1716 out_file = ""
1718 print(" df len 1062 : " + str(len(df)))
1720 if compte_rendu_complet_medecin == "from_json_copy":
1721 from lib.lib_util import split_list_page_by_doc
1722 text = input["text"] if "text" in input else None
1723 if text != None and (
1724 len(input["list_page_per_doc"]) == len(text) or (len(input["list_page_per_doc"]) == 1 and type(text) == str)):
1725 print(
1726 "We expect list_text to be the same object, we force it when length is one (ie ONE DOC) : VR 3-6-25")
1727 if type(text) == str:
1728# list_texts = [text]
1729 list_list_page_doc = [[]] # we will actually need the paragraphs, a refactor is needed!
1731 try:
1732 list_list_page_doc = split_list_page_by_doc(input["paragraphs"], input["list_page_per_doc"])
1733 except Exception as e:
1734 print("l 1051 If only one doc it could run !, and maybe also with multiple doc by the way")
1735 print(str(e))
1737 else :
1738 list_list_page_doc = [None] * len(list_json_to_mettre_en_forme)
1740 if len(list_list_page_doc) != len(list_json_to_mettre_en_forme):
1741 print("ERROR TREATED AS WARNING BUT MAKES HUGE ERROR OR MISSING DATA Problem with list_list_page_doc and list_json_to_mettre_en_forme : " + str(len(list_list_page_doc)) + " != " + str(len(list_json_to_mettre_en_forme)))
1743 if append_parsing_meta_info_to_table:
1744 from lib.lib_util import add_parsing_meta_info_to_table
1745 df = add_parsing_meta_info_to_table(df, list_json_to_mettre_en_forme)
1746 else:
1747 print(" ERROR if we do not add 'id' to df, we will not be able to merge with manual correction")
1748 print(" Furthermore if I want to refacto the loop for exporting automatic datou results, I will have to add 'id' to df and also use df instead of list of json")
1750 print(" df len 1077 : " + str(len(df)))
1752 format_out_file = "docx"
1753 out_file = prefix_file + "_" + hash_id_treatment + "." + format_out_file
1755 print(" df len 1091 : " + str(len(df)))
1757 # not used anymore, set as option or remove
1758 if False:
1759 from lib.lib_util import write_table_list_inner_document
1760 import pandas as pd
1761 if append_table_doc and type(df) == pd.DataFrame:
1762 document = write_table_list_inner_document(df, document, with_hyperlink)
1763 if append_table_page and type(df_by_page) == pd.DataFrame:
1764 document = write_table_list_inner_document(df_by_page, document, with_hyperlink=False)
1766 index_for_hyperlink = 0
1767 for index, row in df.iterrows():
1768 one_json = row.to_dict()
1769 list_paragraph_doc = []
1770 if index < len(list_list_page_doc):
1771 list_page_doc = list_list_page_doc[index]
1772 else :
1773 print(" ERROR missing list page doc and we have not verified the rest !")
1774 continue
1775 # well no, but we could add some checks TODO 26-4-24 one_json["Liste des pages"] if "Liste des pages" in one_json else ""
1776# for one_json, list_page_doc in zip(list_json_to_mettre_en_forme, list_list_page_doc):
1777 try :
1778 if "document_type" not in one_json:
1779 document_type = "default"
1780 print(" Missing document_type in one result " + str(one_json))
1781 else:
1782 if type(one_json["document_type"]) == list:
1783 document_type = one_json["document_type"][0]
1784 print(" document_type is a list : " + str(one_json["document_type"]) + " treated as " + str(document_type))
1785 else:
1786 document_type = one_json["document_type"]
1788 if compte_rendu_complet_medecin == "from_json_copy":
1789 print(" How to be sure it is the same doc as in the list_page_doc ?? Et oui je crois que grace à l'id ou le fait qu'on a fusionné !")
1790 from lib.lib_util import concat_content_from_list_page_doc
1791 if reorder_paragraph_by_order_lex_token:
1792 print("WILL FAIL WE NEED TO CHECK IF WE HAVE token in list_page_content ")
1793 print(" list_page_content : " + str(list_page_content))
1794 from_json_content_copy = concat_content_from_list_page_doc(list_page_doc,
1795 reproduce_format_new_page=reproduce_format_new_page,
1796 height_line=0,
1797 reorder_paragraph_by_order_lex_token = reorder_paragraph_by_order_lex_token,
1798 smart_new_line_from_token_pos = smart_new_line_from_token_pos,
1799 list_class_copy = list_class_copy)
1800 one_json["compte_rendu_complet_medecin"] = from_json_content_copy
1801 print(" from_json_content_copy : " + str(from_json_content_copy)[:100])
1802 df.loc[index_for_hyperlink, "cr_back"] = "empty"
1804# df.loc[index_for_hyperlink, "cr_back"] = from_json_content_copy.replace("\n\n", "\n")
1805 df.loc[index_for_hyperlink, ['cr_back']] = [from_json_content_copy.replace("\n\n", "\n")]
1806 else:
1807 if type(one_json["compte_rendu_complet_medecin"]) == list:
1808 print(" ERROR OR WARNING How to handle compte_rendu_complet_medecin as list : " + str(one_json["compte_rendu_complet_medecin"]))
1809 df.loc[index_for_hyperlink, "cr_back"] = str(one_json["compte_rendu_complet_medecin"])
1811 # TODO remove VR 25-4-24 : most likely useless
1812 if out_file == "":
1813 out_file = prefix_file + "_" + hash_id_treatment + "." + format_out_file
1815 new_format_info = info_format_intro[document_type] if document_type in info_format_intro else info_format_intro["default"]
1816 print(" new_format_info : " + str(new_format_info) + " document_type : " + str(document_type) + " info_format_intro.keys : " + str(info_format_intro.keys()))
1818 format_premier = format_info["format_premier"] if "format_premier" in format_info else "default"
1819 format_date = format_info["format_date"] if "format_date" in format_info else "default"
1820 list_variable_underline = format_info["list_variable_underline"] if "list_variable_underline" in format_info else []
1821 list_variable_bold = format_info["list_variable_bold"] if "list_variable_bold" in format_info else []
1823 new_intro = format_one_res(one_json, new_format_info, format_premier, format_date, verbose=verbose,
1824 list_variable_bold=list_variable_bold, list_variable_underline=list_variable_underline)
1825 df.loc[index_for_hyperlink, "intro_back"] = new_intro
1826 # TODO probably keep: handy for debugging, but actually computed by write ...
1827 result_output += new_intro
1828 index_for_hyperlink += 1 # TODO rename
1830 except Exception as e:
1831 print("Error while parsing one result : " + str(e))
1833 print(" df len 1147 : " + str(len(df)))
1835# input_col_intro = "intro_back"
1836# input_col_cr = "cr_back"
1837# from lib.lib_util import write_table_list_inner_document_0424_bis
1838# total_text = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file,
1839# hash_id_treatment, out_folder)
1840 # To refactor
1841 # with_hyperlink
1842 # reproduce_format_new_page
1845 try:
1846 if order_by_date:
1847 from lib.lib_util import order_df_by_date
1848 df = order_df_by_date(df)
1849 except Exception as e:
1850 print("CHECK !")
1851 print(str(e))
1853 if order_by_document_type:
1854 from lib.lib_util import order_by_document_type
1855 df = order_by_document_type(df)
1857 if nb_blank_line > 0:
1858 from lib.lib_util import add_blank_line
1859 try:
1860 df = add_blank_line(df, nb_blank_line)
1861 except Exception as e:
1862 print(str(e))
1865 import pandas as pd
1866 df_complet_as_markdown = df.to_markdown() if type(df) == pd.DataFrame else ""
1867 df_complet_as_json = df.to_json() if type(df) == pd.DataFrame else ""
1868 df_auto = None
1870# print(" df_complet_as_markdown len 1152 : " + str(len(df_complet_as_markdown)))
1871 print(" out_folder : " + out_folder)
1873 input_col_intro = "intro_back"
1874 input_col_cr = "cr_back"
1875 from lib.lib_util import write_table_list_inner_document_0424_bis
1876 try:
1877 total_text, outfile_name_docx, audit_info_write = write_table_list_inner_document_0424_bis(df, input_col_intro, input_col_cr, out_file,
1878 hash_id_treatment, out_folder, format_info)
1879 except Exception as e:
1880 print(str(e))
1881 print("Error while computing plop write_table_list_inner_document_0424_bis format")
1883 output = {"result" : result_output, "out_file" : out_file,
1884 "df_complet_as_markdown" : df_complet_as_markdown,
1885 "df_complet_as_json" : df_complet_as_json,
1886 "nb_doc" : nb_doc,
1887 "nb_page_from_df" : nb_page_from_df,
1888 "nb_word_result" : total_text.count(" ") + 1,
1889 "nb_modif_manual" : nb_modif_manual,
1890 "nb_doc_modif_correct_test_2812" : "test_integration_prime_productivite",
1891 "prime_productivite_test_2812" : "test_integration_prime_productivite",
1892 "info_correction" : info_correction if 'info_correction' in locals() else "",
1893 "nb_page_perfect" : nb_page_perfect if 'nb_page_perfect' in locals() else -1,
1894 "time_minute_labelizer" : time_minute_labelizer if 'time_minute_labelizer' in locals() else -1.0,
1895 "user_id_labeliser" : user_id_labeliser if 'user_id_labeliser' in locals() else -1,
1896 "split_auto_perfect" : split_auto_perfect if 'split_auto_perfect' in locals() else None,
1897 "nb_modif_class_manual" : nb_modif_class_manual,
1898 "prefix_file" : prefix_file,
1899 "nb_manual_action_df_for_col_audit" : nb_manual_action_df_for_col_audit,
1900 "nb_manual_action_df" : nb_manual_action_df,
1901 "out_folder" : out_folder,
1902 "df" : df,
1903 "outfile_name_docx" : outfile_name_docx,
1904 "content_commemo" : total_text}
1906 print(str(audit_info_count.keys()))
1908 # Temporary!
1909 path_csv_complete_with_prediag = "/Users/moilerat/Documents/Fotonower/Safia/prompt/misc/csv_prediag_all_almost/all_csv_prediag.csv"
1910# path_csv_complete_with_prediag = "/home/safia/workarea/git/Safia/prompt/python/misc/csv_prediag_all_almost/all_csv_prediag.csv"
1911 if os.path.exists(path_csv_complete_with_prediag):
1912 try:
1913 import pandas as pd
1914 df_prediag = pd.read_csv(path_csv_complete_with_prediag, sep=";") # read_csv returns a single DataFrame, not a tuple
1916 from lib.sandbox.migration import migrate_df_complete_with_prediag
1917 df = migrate_df_complete_with_prediag(df_prediag, df, id_file)
1919 except Exception as e:
1920 print("Error while loading csv complete with prediag : " + str(e))
1922 #if limit == 0:
1923 print(" if limit == 0 (but where do we get it from, grr) We could also insert the prediag into the df and return it in the json")
1925 output["map_count_modif_per_doc"] = audit_info_count["map_count_modif_per_doc"] if "map_count_modif_per_doc" in audit_info_count else {}
1926 output["map_modif_type_document"] = audit_info_count["map_modif_type_document"] if "map_modif_type_document" in audit_info_count else {}
1927 output["audit_info_write"] = audit_info_write
1928 try :
1929 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages', 'Nombre de pages', 'Titre', 'medecin_specialite', 'nom_hopital', 'genre_service_hopital',
1930 'indication_examen', 'date_entree_hospitalisation',
1931 'date_sortie_hospitalisation', 'motif_hospitalisation',
1932 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced',
1933 'date_fin_arret_travailt', 'date_entree_hospitalisationt', 'date_sortie_hospitalisationt']:
1934 col_auto = col_ref + "_auto"
1935 df[col_auto] = "None"
1937 if col_ref not in df.columns:
1938 print("Missing col ref : " + str(col_ref) + " in df")
1939 continue
1941 if type(df_auto) == pd.DataFrame:
1942 df_auto['Nombre de pages'] = "None"
1943 for index, row in df_auto.iterrows():
1944 if 'Liste des pages' in df_auto.columns:
1945 df_auto.loc[index, 'Nombre de pages'] = str(len(row['Liste des pages'].split(','))) if type(row['Liste des pages']) == str else "None"
1947 if index < len(df):
1948 found_list = df.index[df['Liste des pages'] == row['Liste des pages']].tolist()
1949 for col_ref in ["medecin_nom", "medecin_prenom", "document_type", 'Liste des pages',
1950 'Nombre de pages', 'Titre', 'medecin_specialite', 'nom_hopital',
1951 'genre_service_hopital',
1952 'indication_examen', 'date_entree_hospitalisation',
1953 'date_sortie_hospitalisation', 'motif_hospitalisation',
1954 'date_fin_arret_travail', 'TitreMeta', 'datet', 'date_parsed_or_forced',
1955 'date_fin_arret_travailt', 'date_entree_hospitalisationt',
1956 'date_sortie_hospitalisationt']:
1957 col_auto = col_ref + "_auto"
1958# df[col_auto] = "None"
1960 if col_ref not in df.columns:
1961 print("Missing col ref : " + str(col_ref) + " in df")
1962 continue
1964 if len(found_list) == 0:
1965 print("Missing idx for liste pages " + str(row['Liste des pages']))
1966 if len(found_list) > 1:
1967 print("Warning : more than one idx for liste pages " + str(row['Liste des pages']) + " found : " + str(found_list))
1969 for idxf in found_list:
1970 df.loc[idxf, col_auto] = row[col_ref]
1971# df.loc[index, col_auto] = row[col_ref]
1972 else :
1973 print("Inconsistent dataframe auto and df : " + str(index) + " > " + str(len(df)))
1975 print(" df_auto : " + str(df_auto))
1976 output["df_cons"] = df.to_json()
1977 # Test write and load df.to_dict()
1978 output["df_cons"] = df.to_dict()
1979 output["df_auto"] = df_auto.to_json()
1980 except Exception as e:
1981 print(str(e))
1982 if "hash_id_treatment_input" in input:
1983 output["hash_id_treatment_input"] = input["hash_id_treatment_input"]
1985 return output
1987def datou_safia_step_classify_doc(input : dict = {},
1988 param_json : dict = {},
1989 ce : CE = None,
1990 verbose : bool = False,
1991 layer_api : LayerGeneric = None) -> dict :
1992 list_input = []
1993 list_output = []
1994 list_param_json = []
1995 paragraphs = input["paragraphs"] if "paragraphs" in input else []
1996 rules_classifier = param_json["rules_classifier"] if "rules_classifier" in param_json else []
1997 task = input["task"] if "task" in input else param_json["task"] if "task" in param_json else ""
1999 list_detects = rules_classifier["detect"] if "detect" in rules_classifier else "detect"
2000 map_classifier = rules_classifier["classify"] if "classify" in rules_classifier else "classify"
2001 input_format = param_json["input_format"] if "input_format" in param_json else "markdown"
2003 # task : re_classifier,prepare_prompt,parse_result,merge_result,classify_doc
2005 # "taxonomy_text": {"header":{"key":"H","description":"En-tete et pied de page : adresse(s) de la structure, nom du/des medecins, numéro du secretariat, raison social, Logo, numéro des pages, les titres et diplomes, condition de paiement"},"info_medecin":{- Spécialité et Nom du médecin (des fois en en-tete ou signature (eventuellement tampon) )\n- Date du jour (ou de l'edition)\n- Données personnels du patient : ...\n- Contenu pertinent : \n - A faire selon les classes de document\n - Par défaut\n- Ecriture manuscrite\n- Document administratif (CNI)\n- Autres : Règle administrative, preuve du respect du secret médical\nCertificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit.}
2006 taxonomy_text = param_json["taxonomy_text"] if "taxonomy_text" in param_json else {"taxonomy_text":
2007 {"header":
2008 {"key":"H",
2009 "description":"En-tete et pied de page : adresse(s) de la structure, nom du/des medecins, numéro du secretariat, raison social, Logo, numéro des pages, les titres et diplomes, condition de paiement"
2010 },
2011 "info_medecin":
2012 {"key":"M",
2013 "description":"- Spécialité et Nom du médecin (des fois en en-tete ou signature (eventuellement tampon) )\n- Date du jour (ou de l'edition)\n- Données personnels du patient : ...\n- Contenu pertinent : \n - A faire selon les classes de document\n - Par défaut\n- Ecriture manuscrite\n- Document administratif (CNI)\n- Autres : Règle administrative, preuve du respect du secret médical\nCertificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit."},
2014 "content":
2015 {"key":"C","description":"Contenu pertinent : description specifique du cas du patient"},
2016 "manuscrit":
2017 {"key":"A","description":"Mots tronqués ou mal orthographier, alphabet étrangère"},
2018 "admin":{"key":"D","description":"Document administratif (CNI) ou autre document administratif"},
2019 "autre":{"key":"O","description":"Autres : Règle administrative, preuve du respect du secret médical, par exemple le texte : Certificat établi à la demande de l'intéressé et remis en main propre pour servir et faire valoir ce que de droit."}
2020 }
2021 }
2022# map_taxonomy_per_key = {taxonomy_text[k]["key"] : k if "key" in taxonomy_text[k] else k : k for k in taxonomy_text}
2023 try :
2024 map_taxonomy_per_key = {taxonomy_text[k]["key"] : k for k in taxonomy_text}
2025 except Exception as e:
2026 print(str(e))
2027 map_taxonomy_per_key = {}
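# Editorial sketch, not part of the original source: inverting the taxonomy maps a
# one-letter key back to its class name, which the parse_result task below relies on.
if False: # illustration only
    demo_taxonomy = {"header": {"key": "H"}, "content": {"key": "C"}}
    assert {v["key"]: k for k, v in demo_taxonomy.items()} == {"H": "header", "C": "content"}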
2029 if task == "re_classifier":
2030 from lib.lib_ml.lib_text_classifier import classify_text
2031 for page in list_page_content:
2032 for paragraph in page.list_blocks["paragraphs"]:
2033 for detect in list_detects:
2034 res = classify_text(paragraph["text"], list_detects[detect], verbose = verbose)
2035 if len(res) > 0:
2036 print("detect : " + str(detect) + " res : " + str(res))
2037 if detect in map_classifier:
2038 paragraph["class"] = map_classifier[detect]
2039 else :
2040 print("Missing class, by default we keep it as content_classn but later since this case is not used in production as of 4-3-24")
2041 elif task == "prepare_prompt": # prepare_prompt_classifier_bib
2042 from lib.prompt.lib_gen_prompt import generate_prompt_classifier
2043 preprompt = generate_prompt_classifier(taxonomy_text, verbose = verbose, output_type = input_format)
2044 # TODO add in input input_type = "json", output_type = "table",
2045 # language = "fr",
2046 # output_key = "id", output_class = "classe",
2047 # separator
2048 text = str(paragraphs[0]) if len(paragraphs) > 0 else ""
2049 output = {"preprompt" : preprompt, "text" : text}
2050 elif task == "parse_result": # parse_result_prompt_classifier_bib
2051 result = input["result"] if "result" in input else ""
2052 from lib.batch.lib_batch import create_pandas_table_from_text
2053 from lib.lib_util import parse_json_from_prompt_result
2055 if input_format == "markdown":
2056 df = create_pandas_table_from_text(result, verbose = verbose)
2057 elif input_format == "json":
2058 df = parse_json_from_prompt_result(result, verbose = verbose)
2059 else :
2060 print("ERROR Unsupported input_format : " + str(input_format))
2061 # df = {}
2062 print("TODO finish voila (add in document if only one document)")
2063 if len(paragraphs) != 1:
2064 print("only one document is managed since we use table as output for prompt !")
2065 else :
2066 output_key = "id"
2067 output_class = "classe"
2068 for paragraph in paragraphs[0]:
2069 import sys
2070 sys.stdout.write("ç")
2071 if "id" not in paragraph:
2072 print("Missing id in paragraph : " + str(paragraph))
2073 continue
2074 id = paragraph["id"]
2075 data_found = df[df["id"] == str(id)]
2076 key_classes = data_found[output_class] if output_class in data_found else []
2077 if len(key_classes) == 1:
2078 key_class = key_classes.iloc[0] # take the single matched row by position
2079 else:
2080 print("Missing class")
2081 key_class = "unknown"
2082# key_class = df[output_class][df["id"] == str(id)]
2083 paragraph["class"] = map_taxonomy_per_key[key_class] if key_class in map_taxonomy_per_key else "unknown"
2084 output = {}
2086 elif task == "merge_result":
2087 pass
2088 elif task == "classify_doc":
2089 pass
2090 elif task == "detect_name_camembert": # context_entity_camembert
2091 from lib.lib_ml.lib_nlp.lib_pipeline_ner import detect_name_ner
2092 input_text = input["text"] if "text" in input else ""
2093 name_pretrained_model = param_json["name_pretrained_model"] if "name_pretrained_model" in param_json else "Jean-Baptiste/camembert-ner"
2094 name_tokenizer = param_json["name_tokenizer"] if "name_tokenizer" in param_json else "Jean-Baptiste/camembert-ner"
2095 what_I_want = param_json["what_I_want"] if "what_I_want" in param_json else "PER"
2096 aggregation_strategy = param_json["aggregation_strategy"] if "aggregation_strategy" in param_json else "simple"
2097 list_to_treat = [input_text]
2098 if False and "list_page_content" in input:
2099 list_page_content = input["list_page_content"]
2100 list_to_treat = []
2101 for l in list_page_content:
2102# list_to_treat.append(l["text"])
2103 list_to_treat.append(l.content)
2105 all_list_name = []
2106 for input_text_aux in list_to_treat:
2107 list_name = detect_name_ner(input_text_aux,
2108 name_pretrained_model=name_pretrained_model,
2109 name_tokenizer=name_tokenizer,
2110 what_I_want=what_I_want,
2111 aggregation_strategy=aggregation_strategy)
2112 all_list_name.extend(list_name)
2114 print(" list_name : " + str(all_list_name))
2115 output = {"pers" : all_list_name}
2116 output = {"pers" : all_list_name, "result" : "```json\n{\"PERS\":[" + ",".join(list(map(lambda x : "\"" + x + "\"", all_list_name))) + "]}\n```"}
2117 elif task == "parse_result_camembert": # parse_result_camembert
2118 list_name = input["pers"] if "pers" in input else []
2120 pass
2121 elif task == "prepare_anon_from_camembert": # parse_result_camembert
2122 list_name = input["pers"] if "pers" in input else []
2124 pass
2125 elif task == "create_entity_bib_from_camembert": # parse_result_camembert
2126 pass
2127 else :
2128 print("Unsupported task : " + str(task))
2129 output = {"result": "Unsupported task : " + str(task)}
2131 import sys
2132 sys.stdout.write("O")
2133 return output
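# Editorial sketch, not part of the original source: the fenced-JSON "result" string
# built in the detect_name_camembert branch, shown standalone with json.dumps instead
# of manual quoting (json.dumps adds a space after commas but otherwise produces the
# same payload for plain names, and escapes them correctly). Helper name hypothetical.
import json
def _demo_pers_result(names): # illustration only
    return "```json\n" + json.dumps({"PERS": names}) + "\n```"
# _demo_pers_result(["Alice"]) == '```json\n{"PERS": ["Alice"]}\n```'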
2136def datou_safia_step_client(input : dict = {},
2137 param_json : dict = {},
2138 ce : CE = None,
2139 verbose : bool = False,
2140 layer_api : LayerGeneric = None) -> dict : # pragma no cover
2141 list_input = ["file", "preprompt", "model", "files"]
2142 list_output = []
2143 list_param_json = ["host", "protocol", "port", "end_point", "preprompt"]
2145 end_point = param_json["end_point"] if "end_point" in param_json else "api/v1/upload"
2146 host = param_json["host"] if "host" in param_json else "localhost"
2147 protocol = param_json["protocol"] if "protocol" in param_json else "http"
2148 port = param_json["port"] if "port" in param_json else 4998
2150 file = input["file"] if "file" in input else ""
2151 files = input["files"] if "files" in input else []
2152 preprompt = input["preprompt"] if "preprompt" in input else param_json["preprompt"] if "preprompt" in param_json else ""
2153 model = input["model"] if "model" in input else ""
2155 import logging
2156 logger = logging.getLogger()
2157 logger.info("In datou_safia_step_client l 2081 ") # + str(__line__))
2159 print("file : " + str(file))
2160 print("files : " + str(files))
2161 print("preprompt : " + str(preprompt))
2162 print("model : " + str(model))
2163 # Send http form multipart request
2164 # TODO : send file and preprompt to the server
2165 # TODO : get result from server
2166 # TODO : return result
2167 import requests
2168 epls = end_point.lstrip('/')
2169 url = f"{protocol}://{host}:{port}/{epls}"
2170 if len(files) == 0:
2171 files.append(file)
2172 map_res_file = {}
2173 map_full_res_file = {}
2174 for f in files:
2175 import os
2176 fbn = os.path.basename(f)
2177 file_handle = open(f, 'rb')
 file_json = {'file': file_handle}
2178 logger.info("In datou_safia_step_client l 2102 preprompt " + str(preprompt[:100])) # + str(__line__))
2180 data = {'preprompt': preprompt} #, 'model': model}
2181 if model != None and model != '':
2182 data['model_name'] = model
2183 try :
2184 response = requests.post(url, files=file_json, data=data)
2185 if response.status_code == 200:
2186 print("File uploaded successfully")
2187 else:
2188 print("File upload failed")
2190 logger.info("In datou_safia_step_client l 2112 response received ") # + str(__line__))
2191 logger.info("In datou_safia_step_client l 2112 response.status_code " + str(response.status_code)) # + str(__line__))
2193 print("TO USE TO CREATE NEW STEP")
2195 import json
2196 res_parsed_json = json.loads(response.content.decode('utf-8'))
2197 except Exception as e:
2198 print("Error while sending file to server in datou_safia_step_client : " + str(e))
2199 res_parsed_json = {}
 finally:
 file_handle.close() # always release the uploaded file handle
2201 some_useful_result = res_parsed_json["res"] if "res" in res_parsed_json else []
2202 one_useful_result = some_useful_result[0] if len(some_useful_result) > 0 else ""
2203 result_as_array = one_useful_result.split("assistant<|end_header_id|>")
2204 if len(result_as_array) > 1:
2205 the_useful_result = result_as_array[1].replace("\n", "").removesuffix("<|eot_id|>") # str.removesuffix (3.9+) removes the literal trailing token
2206 else :
2207 the_useful_result = ""
2209 map_res_file[fbn] = the_useful_result
2210 map_full_res_file[fbn] = res_parsed_json
2211# assistant<|end_header_id|>
2212 #
2213 # TABLEAU<|eot_id|>
2215 logger.info("In datou_safia_step_client l 2138 wip ")
2216 csv_result = ""
2217 for f in map_res_file:
2218 if csv_result != "":
2219 csv_result += ","
2220 csv_result += map_res_file[f]
2222 output = {"map_result" : map_res_file,
2223 "result" : csv_result,
2224 "full_result" : map_full_res_file}
2226 return output
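# Editorial sketch, not part of the original source: extracting the assistant segment
# from a llama-3-style completion as done above, using str.removesuffix (Python 3.9+).
# Helper name hypothetical.
def _demo_extract_assistant(raw): # illustration only
    parts = raw.split("assistant<|end_header_id|>")
    return parts[1].replace("\n", "").removesuffix("<|eot_id|>") if len(parts) > 1 else ""
# _demo_extract_assistant("...assistant<|end_header_id|>\nTABLEAU<|eot_id|>") == "TABLEAU"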
2230# Keep for easy implementation of new function (remove pragma no cover and complete all)
2231# Some time it will be also needed to do DEV DOC : INSERT INTO mtrdatou.datou_step_template (name, function_name, param_json_list, input_list, output_list) VALUES ('append_to_doc', 'datou_safia_step_append_to_doc_content', '["openai_token"]', '["result", "document_id", "project_id", "user_id"]', '["result", "references"]');
2232def datou_safia_step_TEMPLATE(input : dict = {},
2233 param_json : dict = {},
2234 ce : CE = None,
2235 verbose : bool = False,
2236 layer_api : LayerGeneric = None) -> dict : # pragma no cover
2237 list_input = []
2238 list_output = []
2239 list_param_json = []
2241 print("TO USE TO CREATE NEW STEP")
2243 output = {"result" : "some_result (but TEMPLATE)"}
2245 return output
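# Editorial sketch, not part of the original source: a minimal step filled in from the
# TEMPLATE above, echoing one input back. The step name is hypothetical; registering it
# would follow the INSERT INTO mtrdatou.datou_step_template pattern documented above.
def datou_safia_step_echo(input : dict = {}, param_json : dict = {}, ce : CE = None, verbose : bool = False, layer_api : LayerGeneric = None) -> dict :
    text = input["text"] if "text" in input else ""
    return {"result" : text}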