Coverage for lib/anon/lib_anon.py: 78%
184 statements
« prev ^ index » next coverage.py v7.9.1, created at 2026-02-18 02:40 +0100
« prev ^ index » next coverage.py v7.9.1, created at 2026-02-18 02:40 +0100
def map_index_to_anon(json_info_to_anon,
                      verbose=False,
                      remove_search_string_for_key_private_data=None):
    """Build the lookup structures used to replace private words.

    Args:
        json_info_to_anon: dict mapping a category key (for example ``"nom"``)
            to an iterable of private words (strings).
        verbose: accepted for interface compatibility; not used here.
        remove_search_string_for_key_private_data: category keys whose words
            must additionally be searched as substrings of longer texts.
            ``None`` (the default) means no key.

    Returns:
        A ``(map_to_anon, search_strings)`` tuple:

        * ``map_to_anon`` maps each lowercased word to the first category
          key it was seen under.
        * ``search_strings`` lists the lowercased words of the requested
          keys, without duplicates, longest first so that a longer match
          is tried before any of its substrings.

        Every entry of ``search_strings`` is a key of ``map_to_anon``.
    """
    substring_keys = remove_search_string_for_key_private_data or ()
    map_to_anon = {}
    # A dict is used as an insertion-ordered set of search strings.
    search_strings = {}

    for key, words in json_info_to_anon.items():
        wanted_as_substring = key in substring_keys
        for word in words:
            lowered = word.lower()
            # An empty search string is contained in every text and would
            # flag every box as private, so blank words are ignored.
            if not lowered.strip():
                continue
            if lowered not in map_to_anon:
                map_to_anon[lowered] = key
            elif map_to_anon[lowered] != key:
                print("WARNING : word " + str(word)
                      + " already in map_to_anon under key "
                      + str(map_to_anon[lowered])
                      + ", ignoring key " + str(key))
            if wanted_as_substring:
                # Stored lowercased, like the map keys: callers compare
                # against lowercased text and then index map_to_anon.
                search_strings.setdefault(lowered, None)

    ordered_by_decreasing_length = sorted(search_strings, key=len, reverse=True)
    return map_to_anon, ordered_by_decreasing_length
def write_text_on_image(image, text, xmin, ymin,
                        xmax=None,
                        ymax=None,
                        color=(0, 0, 0),
                        font_size=5,
                        old_text=""):
    """Draw ``text`` on ``image`` in place, optionally over a filled box.

    Args:
        image: image array accepted by OpenCV drawing functions; modified
            in place.
        text: replacement text to draw.
        xmin, ymin: top-left corner of the target box.
        xmax, ymax: bottom-right corner of the target box. When both are
            given, the box is filled first and the font scale is derived
            from the box height. When either is missing, no box is drawn,
            ``font_size`` is used as given and the text is anchored at
            ``(xmin, ymin)``.
        color: text colour passed to ``cv2.putText``.
        font_size: font scale used when no complete box is provided.
        old_text: text being replaced. When non-empty, ``text`` is cut to
            at most ``len(old_text)`` characters.

    Returns:
        None.
    """
    import cv2

    font = cv2.FONT_HERSHEY_SIMPLEX
    has_box = xmax is not None and ymax is not None

    if has_box:
        # Font scale is proportional to the box height.
        font_size = (ymax - ymin) / 20
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (255, 200, 255), -1)
        text_origin = (xmin, ymax)
    else:
        # Without a full box, passing None coordinates to OpenCV would
        # fail; fall back to the only point that is known.
        text_origin = (xmin, ymin)

    # Only truncate when the replaced text is actually known: with the
    # default empty old_text the replacement would otherwise be cut to "".
    if old_text and len(text) > len(old_text):
        text = text[:len(old_text)]

    cv2.putText(image, text, text_origin, font, font_size, color, 2, cv2.LINE_AA)
48# from PIL import Image
49# from PIL import ImageFont
50# from PIL import ImageDraw
51#
52# img = Image.open("sample_in.jpg")
53# draw = ImageDraw.Draw(img)
54# # font = ImageFont.truetype(<font-file>, <font-size>)
55# font = ImageFont.truetype("sans-serif.ttf", 16)
56# # draw.text((x, y),"Sample Text",(r,g,b))
57# draw.text((0, 0),"Sample Text",(255,255,255),font=font)
58# img.save('sample-out.jpg')
def write_blank_on_read_text_then_find_mask_all_not_read(source_image, output_image, list_boxes_unread, verbose = False):
    """Fill every listed box with a flat colour and save the image.

    Args:
        source_image: path of the image to read.
        output_image: path the painted image is written to.
        list_boxes_unread: iterable of dicts with ``"x"``, ``"y"``, ``"w"``
            and ``"h"`` entries describing each box.
        verbose: accepted but not used by this function.

    Returns:
        None.
    """
    import cv2

    fill_color = (255, 255, 200)
    canvas = cv2.imread(source_image)

    for box in list_boxes_unread:
        left, top = box["x"], box["y"]
        right = left + box["w"]
        bottom = top + box["h"]
        # Thickness -1 asks OpenCV for a filled rectangle.
        cv2.rectangle(canvas, (left, top), (right, bottom), fill_color, -1)

    cv2.imwrite(output_image, canvas)
def detect_not_read_text_then_find_mask_all_not_read(source_image,
                                                    output_image_anon_not_read,
                                                    list_boxes_read, verbose = False,
                                                    lower_threshold_white = 250,
                                                    param_threshold = {}):
    """Cover the boxes that were read, then list remaining non-white zones.

    Two images are written: ``output_image_anon_not_read + "_read_color.png"``
    with the read boxes filled in colour, and ``output_image_anon_not_read``
    with the same boxes filled in white. The white-filled image is then
    thresholded and the bounding box of each contour found in it is kept
    when its area and mean pixel value pass the thresholds.

    Args:
        source_image: path of the page image.
        output_image_anon_not_read: output path (also used as a prefix).
        list_boxes_read: dicts with ``"x"``, ``"y"``, ``"w"``, ``"h"``,
            ``"text"`` and optionally ``"conf"``.
        verbose: print the measurements of every candidate box.
        lower_threshold_white: a box is kept only if its mean pixel value
            is below this number.
        param_threshold: optional overrides ``"min_area_box_unread"``
            (default 100) and ``"min_pytesseract_confidence"`` (default 40).

    Returns:
        ``(image, boxes)``: the white-filled image and the list of kept
        boxes, each a dict with ``x``, ``w``, ``y``, ``h``, ``class`` and
        ``text`` entries.
    """
    min_area_box_unread = param_threshold.get("min_area_box_unread", 100)
    min_pytesseract_confidence = param_threshold.get("min_pytesseract_confidence", 40)
    import cv2

    def _cover_read_boxes(canvas, fill):
        # Fill every box holding text whose confidence is absent or above
        # the minimum.
        for box in list_boxes_read:
            left = box["x"]
            right = box["x"] + box["w"]
            top = box["y"]
            bottom = box["y"] + box["h"]
            if box["text"] == "":
                continue
            if "conf" in box and not (box["conf"] > min_pytesseract_confidence):
                continue
            cv2.rectangle(canvas, (left, top), (right, bottom), fill, -1)

    # First pass: coloured overlay of what was read.
    image = cv2.imread(source_image)
    _cover_read_boxes(image, (255, 200, 255))
    cv2.imwrite(output_image_anon_not_read + "_read_color.png", image)

    # Second pass: start again from the source and whiten the read boxes.
    image = cv2.imread(source_image)
    _cover_read_boxes(image, (255, 255, 255))
    cv2.imwrite(output_image_anon_not_read, image)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Threshold the grayscale image to obtain a binary mask.
    mask = cv2.threshold(gray, 240, 240, cv2.THRESH_BINARY)[1]

    list_boxes_hide_non_white = []
    # scikit-image is used to find the contours in the binary mask.
    from skimage import measure
    import numpy as np

    contours = measure.find_contours(mask, 0.8)
    # Fresh mask with the same shape, on which the contours are drawn.
    mask = np.zeros_like(mask, dtype=np.uint8)

    for contour in contours:
        cv2.drawContours(mask, [contour.astype(int)], -1, 255, -1)

        # Flattened coordinates: even positions are read as y values and
        # odd positions as x values.
        flat_coords = contour.astype(int).ravel().tolist()
        x_values = flat_coords[1::2]
        y_values = flat_coords[0::2]
        x = min(x_values)
        y = min(y_values)
        w = max(x_values) - x
        h = max(y_values) - y

        one_box = {"x" : x, "w" : w, "y" : y, "h" : h, "class" : "unread", "text" : ""}
        mean_color = np.mean(image[y:y+h, x:x+w])
        current_area = w * h

        if verbose:
            print("".join([
                " current_area : ", str(current_area),
                " mean_color : ", str(mean_color),
                " x : ", str(x),
                " y : ", str(y),
                " w : ", str(w),
                " h : ", str(h),
                " mean_color : ", str(mean_color),
                " lower_threshold_white : ", str(lower_threshold_white),
                " min_area_box_unread : ", str(min_area_box_unread),
                " current_area > min_area_box_unread : ", str(current_area > min_area_box_unread),
                " mean_color < lower_threshold_white : ", str(mean_color < lower_threshold_white),
            ]))

        if current_area > min_area_box_unread and mean_color < lower_threshold_white:
            list_boxes_hide_non_white.append(one_box)

    print(" We have not yet rewrite image !")

    return image, list_boxes_hide_non_white
def split_name_or_adresse(list_nom,
                          exclude_word_split=None,
                          exclude_bib_start=None):
    """Split names or addresses into distinct lowercase words to anonymise.

    Args:
        list_nom: iterable of strings (names, addresses, ...).
        exclude_word_split: words that must never be returned, compared
            case-insensitively. ``None`` selects the default list
            ``du, de, d', patient, patiente, place``.
        exclude_bib_start: markers that discard a whole entry when the
            entry starts with the marker or contains it right after a
            space (case-insensitive). ``None`` means no marker.

    Returns:
        List of lowercase words longer than two characters, without
        duplicates, in order of first appearance.
    """
    if exclude_word_split is None:
        exclude_word_split = ("du", "de", "d'", "patient", "patiente", "place")
    if exclude_bib_start is None:
        exclude_bib_start = ()

    # Tokens are lowercased below, so the exclusions must be as well.
    excluded_words = {word.lower() for word in exclude_word_split}
    discard_markers = [marker.lower() for marker in exclude_bib_start]

    # A dict is used as an insertion-ordered set to drop repeated words.
    unique_words = {}
    for nom in list_nom:
        lowered = nom.lower()
        # NOTE: this filtering should ideally happen when names are selected.
        if any(lowered.startswith(marker) or " " + marker in lowered
               for marker in discard_markers):
            continue
        for sub_nom in lowered.split():
            if len(sub_nom) > 2 and sub_nom not in excluded_words:
                unique_words.setdefault(sub_nom, None)

    return list(unique_words)
def write_image_with_anon_text(source_image, source_image_anon,
                               list_text_to_write, verbose = False,
                               strat_replace_private_data = "replace_by_keyword"):
    """Draw every replacement entry on the page image and save the result.

    Args:
        source_image: path of the image to read.
        source_image_anon: path the annotated image is written to.
        list_text_to_write: dicts with ``"text"``, ``"xmin"``, ``"ymin"``,
            ``"xmax"``, ``"ymax"`` and ``"old_text"`` entries, forwarded
            one by one to ``write_text_on_image``.
        verbose: accepted but not used by this function.
        strat_replace_private_data: accepted but not used by this function.

    Returns:
        None.
    """
    print(" TO TEST ! ")
    import cv2

    page = cv2.imread(source_image)
    for entry in list_text_to_write:
        write_text_on_image(
            page,
            entry["text"],
            entry["xmin"],
            entry["ymin"],
            entry["xmax"],
            entry["ymax"],
            old_text=entry["old_text"],
        )
    cv2.imwrite(source_image_anon, page)
def anon_document(result_info_to_anon, list_page_content,
                  verbose = False,
                  keyword = {},
                  hash_id_treatment = "default_anon_document_warning",
                  prefix_file = "",
                  word_to_keep = ["patient", "patiente"],
                  exclude_word_split = ["de","d'","du"],
                  word_trigger_hide_all_page = ["Carte Nationale d'identite"],
                  anon_all_unread=False,
                  remove_search_string_for_key_private_data=[],
                  exclude_pers = ["docteur"],
                  exclude_bib_start = [],
                  out_folder = None,
                  paragraphs_to_anon = []): # TODO: leftover placeholder note from the author
    """Anonymise every page image of a document and bundle them in a PDF.

    Steps, in order:

    1. ``result_info_to_anon`` is parsed with
       ``lib.lib_util.parse_json_from_prompt_result`` and reduced to a
       single dict (first element when a list is returned).
    2. The ``"nom"``, ``"prenom"``, ``"adresse"`` and ``"PERS"`` entries,
       when present, are split into words with ``split_name_or_adresse``.
    3. ``map_index_to_anon`` builds the word -> key map; every entry of
       ``keyword`` is additionally mapped to ``"XXXXXX"``.
    4. For each page, the boxes of ``list_blocks["paragraphs"]`` are
       matched against the map, the replacements are drawn into
       ``<source_image>_anon.png``, and non-white zones outside the read
       boxes are detected (and blanked when ``anon_all_unread`` is true).
    5. All anonymised images are saved into one PDF.

    Args:
        result_info_to_anon: raw result handed to the JSON parser.
        list_page_content: page objects; the code reads their
            ``list_boxes``, ``list_blocks`` and ``source_image`` attributes.
        verbose: print pages and boxes while processing.
        keyword: iterable of extra words replaced by ``"XXXXXX"``.
        hash_id_treatment: used in the PDF file name.
        prefix_file: used in the PDF file name.
        word_to_keep: not used in this function.
        exclude_word_split: forwarded to ``split_name_or_adresse``.
        word_trigger_hide_all_page: not used in this function.
        anon_all_unread: when true, the detected unread boxes are painted
            over in the anonymised image.
        remove_search_string_for_key_private_data: forwarded to
            ``map_index_to_anon``.
        exclude_pers: words looked up in short boxes (see the review note
            in the loop: the lookup currently has no effect).
        exclude_bib_start: forwarded to ``split_name_or_adresse`` for the
            ``"PERS"`` entry only.
        out_folder: folder of the PDF; defaults to the folder of the first
            anonymised image.
        paragraphs_to_anon: per-page lists of replacement entries that are
            drawn in addition to the matches found here.

    Returns:
        ``(list_images_anon, pdf_path, unsplit_json_info_to_anon)``: the
        anonymised image paths, the PDF path and a copy of the parsed
        information taken before the name/address splitting.

    Raises:
        Exception: when the parsed result is a non-empty list whose first
            element is not a dict.

    NOTE(review): nesting levels in this body were reconstructed from a
    flattened listing; the spots marked below should be confirmed against
    the real file. ``list_images_anon[0]`` and ``images[0]`` are indexed
    unconditionally, so an empty ``list_page_content`` is not handled.
    """
    from lib.lib_util import parse_json_from_prompt_result
    json_info_to_anon = parse_json_from_prompt_result(result_info_to_anon, verbose = verbose, lazy = False)

    if type(json_info_to_anon) == list and len(json_info_to_anon) == 1:
        json_info_to_anon = json_info_to_anon[0]
    else:
        # NOTE(review): this branch is also reached when the parsed value is
        # not a list at all, in which case the message below is misleading.
        print("WARNING : json_info_to_anon seems to have more than one element, we take the first one !")
        # Test if json_info_to_anon is a list of dict
        # NOTE(review): assumed to be nested in the else branch - confirm.
        if type(json_info_to_anon) == list:
            if len(json_info_to_anon) == 0:
                print("WARNING : json_info_to_anon is empty, we set an empty dict !")
                json_info_to_anon = {}
            elif len(json_info_to_anon) > 0 and type(json_info_to_anon[0]) == dict:
                json_info_to_anon = json_info_to_anon[0]
                # NOTE(review): at this point json_info_to_anon is already the
                # first dict, so this length is the dict's number of keys,
                # not the number of list elements.
                if len(json_info_to_anon) > 1:
                    print("WARNING MISSING DATA : json_info_to_anon seems to have more than one element, we take the first one !" + str(json_info_to_anon))
            else:
                print("WARNING : json_info_to_anon is a list but not of dict ERROR raised !")
                print("json_info_to_anon : " + str(json_info_to_anon))
                # An exception has to be raised here.
                raise Exception("json_info_to_anon is a list but not of dict ERROR raised !")

    # Copy taken before the entries are replaced by their split versions;
    # this copy is what the function returns.
    unsplit_json_info_to_anon = json_info_to_anon.copy()

    # Split name and address
    if "nom" in json_info_to_anon:
        json_info_to_anon["nom"] = split_name_or_adresse(json_info_to_anon["nom"], exclude_word_split = exclude_word_split)
    if "prenom" in json_info_to_anon:
        json_info_to_anon["prenom"] = split_name_or_adresse(json_info_to_anon["prenom"], exclude_word_split = exclude_word_split)
    if "adresse" in json_info_to_anon:
        json_info_to_anon["adresse"] = split_name_or_adresse(json_info_to_anon["adresse"], exclude_word_split = exclude_word_split)
    if "PERS" in json_info_to_anon:
        json_info_to_anon["PERS"] = split_name_or_adresse(json_info_to_anon["PERS"], exclude_word_split = exclude_word_split, exclude_bib_start = exclude_bib_start)

    map_to_anon, list_word_contains = map_index_to_anon(json_info_to_anon, verbose = verbose, remove_search_string_for_key_private_data = remove_search_string_for_key_private_data )
    for k in keyword: # TODO (VR 20-2-24): the purpose of this loop is unclear to the author
        map_to_anon[k] = "XXXXXX"

    list_images_anon = []

    # anon_all_unread

    # Pad with empty lists so that zip() below yields one entry per page.
    if len(paragraphs_to_anon) < len(list_page_content):
        paragraphs_to_anon = paragraphs_to_anon + [[]] * (len(list_page_content) - len(paragraphs_to_anon))
    for (sub_doc_page, par_to_anon_one_page) in zip(list_page_content, paragraphs_to_anon):
        if verbose:
            print("sub_doc_page : " + str(sub_doc_page))

        # The first assignment is immediately overwritten: only the
        # "paragraphs" blocks are processed.
        list_boxes_read = sub_doc_page.list_boxes
        list_boxes_read = sub_doc_page.list_blocks["paragraphs"]
        list_text_to_write = []

        # Start from a copy of the caller-supplied entries for this page so
        # the appends below do not modify the caller's list.
        list_text_to_write = par_to_anon_one_page.copy()

        for box in list_boxes_read:
            if verbose:
                print("box : " + str(box))
            test_to_find = box["text"].lower().rstrip(" ").lstrip(" ")

            # just looking for docteur
            # NOTE(review): the break only leaves this inner for loop, so the
            # exclude_pers lookup changes nothing for the box. It presumably
            # was meant to skip the box - confirm before changing.
            if len(test_to_find) < 30:
                for word in exclude_pers:
                    if word in test_to_find:
                        break
            # Exact match of the whole (trimmed, lowercased) box text.
            if test_to_find in map_to_anon:
                xmin = box["x"]
                ymin = box["y"]
                xmax = xmin + box["w"]
                ymax = ymin + box["h"]
                new_text = {"text" : map_to_anon[test_to_find],
                            "xmin" : int(xmin),
                            "xmax" : int(xmax),
                            "ymin" : int(ymin),
                            "ymax" : int(ymax),
                            "old_text" : box["text"]}
#                            "font_size" : box.font_size}
                list_text_to_write.append(new_text)
                # print(" TODO add y or z or voila")
            else :
                # This branch seems to be useful only for anonymisation after
                # gcp_doc_ai, where paragraphs are exported and not only words.
                for word in list_word_contains:
                    if ("conf" not in box or box["conf"] >= 0) and test_to_find != "" and word in box["text"].lower(): # VR TODO: check that the match is a complete word, to avoid hits when a name is part of another common word
                        xmin = box["x"]
                        ymin = box["y"]
                        xmax = xmin + box["w"]
                        ymax = ymin + box["h"]
                        new_text_str = box["text"].lower().replace(word, map_to_anon[word])
                        new_text = {"text" : map_to_anon[word], # VR TODO: to be fixed
                                    "xmin" : int(xmin),
                                    "xmax" : int(xmax),
                                    "ymin" : int(ymin),
                                    "ymax" : int(ymax),
                                    "old_text" : new_text_str}
                        list_text_to_write.append(new_text)
                        # Only the first matching search string is used per box.
                        break

        source_image_anon = sub_doc_page.source_image + "_anon.png"
        list_images_anon.append(source_image_anon)
        write_image_with_anon_text(sub_doc_page.source_image, source_image_anon,
                                   list_text_to_write, verbose = verbose, strat_replace_private_data = "replace_by_keyword")

        # remove first box that is on all the page
        if len(list_boxes_read) > 0:
            print("TODO check that is is correct and get size of all the page")
            list_boxes_read = list_boxes_read[1:]

        output_image_anon_not_read = source_image_anon + "_not_read.png"

        # The detection reads the original page image, not the anonymised
        # one; the returned image (image_very_anon_np) is not used below.
        image_very_anon_np, list_boxes_hide_non_white = detect_not_read_text_then_find_mask_all_not_read(sub_doc_page.source_image,
                                                                                     output_image_anon_not_read,
                                                                                     list_boxes_read,
                                                                                     verbose=False,
                                                                                     lower_threshold_white=250)

        if anon_all_unread:
            import cv2
            # The anonymised image is overwritten in place.
            write_blank_on_read_text_then_find_mask_all_not_read(source_image_anon, source_image_anon, list_boxes_hide_non_white, verbose = verbose)

        # NOTE(review): assumed to run for every page, outside the
        # anon_all_unread branch - confirm against the real file.
        if "paragraphs" not in sub_doc_page.list_blocks:
            sub_doc_page.list_blocks["paragraphs"] = []
        sub_doc_page.list_blocks["paragraphs"].extend(list_boxes_hide_non_white)

    print(" TOTEST : TODO VR 15-6-23 : concat list_images_anon in pdf ! ")

    from PIL import Image # install by > python3 -m pip install --upgrade Pillow # ref. https://pillow.readthedocs.io/en/latest/installation.html#basic-installation

    pdf_name = "anon_" + prefix_file + "_" + hash_id_treatment + "_anon.pdf"
    import os
    out_folder = out_folder if out_folder != None else os.path.dirname(list_images_anon[0])
    pdf_path = os.path.join(out_folder, pdf_name)

    images = [
        Image.open(f)
        for f in list_images_anon
    ]

    # The first image is saved as a PDF and the others are appended as
    # further pages.
    images[0].save(
        pdf_path, "PDF", resolution=100.0, quality=70, save_all=True, append_images=images[1:]
    )

    return list_images_anon, pdf_path, unsplit_json_info_to_anon