Coverage for lib/anon/lib_anon.py: 78%

184 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2026-02-18 02:40 +0100

1 

2 

def map_index_to_anon(json_info_to_anon,
                      verbose=False,
                      remove_search_string_for_key_private_data=()):
    """Build the word -> category lookup used to anonymise a document.

    Args:
        json_info_to_anon: mapping of a category name (for instance "nom")
            to the list of private strings found for that category.
        verbose: accepted for interface compatibility; not used here.
        remove_search_string_for_key_private_data: category names whose
            strings must also be returned as substring search patterns.
            The default is an immutable empty tuple instead of a shared
            mutable list.

    Returns:
        A tuple ``(map_to_anon, search_strings)``:
        ``map_to_anon`` maps every lower-cased word to the first category
        that declared it; ``search_strings`` holds the original strings of
        the selected categories, longest first so that the most specific
        pattern is tried before its own substrings.
    """
    map_to_anon = {}
    list_text_contains_to_find = []

    for key in json_info_to_anon:
        words = json_info_to_anon[key]
        for word in words:
            # The first category that declares a word wins.
            known_key = map_to_anon.setdefault(word.lower(), key)
            if known_key != key:
                # Only warn on a real conflict: a word repeated under the
                # same category is harmless and used to trigger this too.
                print("WARNING : word " + str(word)
                      + " already in map_to_anon with a different key ("
                      + str(known_key) + " kept, " + str(key) + " ignored)")
        if key in remove_search_string_for_key_private_data:
            list_text_contains_to_find.extend(words)

    # sorted() is stable, so strings of equal length keep their input order.
    list_text_contains_to_find_ordered_by_decreasing_length = sorted(
        list_text_contains_to_find, key=len, reverse=True)

    return map_to_anon, list_text_contains_to_find_ordered_by_decreasing_length

20 

def write_text_on_image(image, text, xmin, ymin,
                        xmax=None,
                        ymax=None,
                        color=(0, 0, 0),
                        font_size=5,
                        old_text=""):
    """Cover a region of `image` and write `text` over it, in place.

    Args:
        image: image array as accepted by the OpenCV drawing functions;
            it is modified in place.
        text: replacement text to draw.
        xmin, ymin: top-left corner of the region.
        xmax, ymax: bottom-right corner of the region. When both are given
            the region is filled with a flat colour and the font scale is
            derived from the region height; otherwise nothing is filled,
            `font_size` is used as-is and the text is anchored at
            (xmin, ymin).
        color: text colour passed to `cv2.putText`.
        font_size: font scale used when no full box is supplied.
        old_text: text being replaced. When non-empty, `text` is cut so it
            is never longer than `old_text`.
    """
    import cv2

    font = cv2.FONT_HERSHEY_SIMPLEX

    if xmax is not None and ymax is not None:
        # Font scale is a fixed fraction of the box height.
        font_size = (ymax - ymin) / 20
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (255, 200, 255), -1)
        text_origin = (xmin, ymax)
    else:
        # Without a complete box there is nothing to fill; the previous code
        # handed None coordinates to OpenCV and crashed.
        text_origin = (xmin, ymin)

    # Only truncate when there is an old text to compare with: with the
    # default old_text="" every replacement used to be cut down to "".
    if old_text and len(text) > len(old_text):
        text = text[:len(old_text)]

    cv2.putText(image, text, text_origin, font, font_size, color, 2, cv2.LINE_AA)

47 

48# from PIL import Image 

49# from PIL import ImageFont 

50# from PIL import ImageDraw 

51# 

52# img = Image.open("sample_in.jpg") 

53# draw = ImageDraw.Draw(img) 

54# # font = ImageFont.truetype(<font-file>, <font-size>) 

55# font = ImageFont.truetype("sans-serif.ttf", 16) 

56# # draw.text((x, y),"Sample Text",(r,g,b)) 

57# draw.text((0, 0),"Sample Text",(255,255,255),font=font) 

58# img.save('sample-out.jpg') 

59 

60 

def write_blank_on_read_text_then_find_mask_all_not_read(source_image, output_image, list_boxes_unread, verbose = False):
    """Paint a filled rectangle over every box and save the result.

    `source_image` is loaded with OpenCV, each entry of `list_boxes_unread`
    (a dict with "x", "y", "w" and "h") is covered with the fixed fill
    colour, and the image is written to `output_image`.
    `verbose` is accepted but not used.
    """
    import cv2

    fill_color = (255, 255, 200)
    canvas = cv2.imread(source_image)

    for box in list_boxes_unread:
        left, top = box["x"], box["y"]
        right, bottom = left + box["w"], top + box["h"]
        cv2.rectangle(canvas, (left, top), (right, bottom), fill_color, -1)

    cv2.imwrite(output_image, canvas)

73 

74 

def _paint_confident_boxes(image, list_boxes_read, min_confidence, color):
    """Fill, in place, every non-empty box read with enough confidence."""
    import cv2

    for box in list_boxes_read:
        xmin = box["x"]
        xmax = box["x"] + box["w"]
        ymin = box["y"]
        ymax = box["y"] + box["h"]
        # Boxes without a "conf" entry are treated as confidently read.
        if box["text"] != "" and ("conf" not in box or box["conf"] > min_confidence):
            cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color, -1)


def detect_not_read_text_then_find_mask_all_not_read(source_image,
                                                     output_image_anon_not_read,
                                                     list_boxes_read, verbose = False,
                                                     lower_threshold_white = 250,
                                                     param_threshold = None):
    """Find the regions of a page that still hold content after removing read text.

    Two images are written:
      * ``output_image_anon_not_read + "_read_color.png"``: the source with
        every confidently-read box filled with a flat colour;
      * ``output_image_anon_not_read``: the source with the same boxes
        filled in white.
    The white-filled image is then thresholded and contoured; each contour
    whose bounding box is large enough and whose mean pixel value is below
    `lower_threshold_white` is reported as an "unread" box.

    Args:
        source_image: path of the page image.
        output_image_anon_not_read: path of the white-filled output image.
        list_boxes_read: dicts with "x", "y", "w", "h", "text" and an
            optional "conf".
        verbose: print the measurements of every candidate region.
        lower_threshold_white: mean pixel value under which a region is
            considered to contain something.
        param_threshold: optional dict overriding "min_area_box_unread"
            (default 100) and "min_pytesseract_confidence" (default 40).

    Returns:
        ``(image, list_boxes_hide_non_white)``: the white-filled image array
        and the list of unread boxes, each a dict with "x", "w", "y", "h",
        "class" ("unread") and "text" ("").
    """
    import cv2
    import numpy as np
    from skimage import measure

    thresholds = param_threshold or {}
    min_area_box_unread = thresholds.get("min_area_box_unread", 100)
    min_pytesseract_confidence = thresholds.get("min_pytesseract_confidence", 40)

    # Coloured rendering of what was read.
    colored = cv2.imread(source_image)
    _paint_confident_boxes(colored, list_boxes_read, min_pytesseract_confidence, (255, 200, 255))
    cv2.imwrite(output_image_anon_not_read + "_read_color.png", colored)

    # Same boxes erased in white: whatever remains was not read.
    image = cv2.imread(source_image)
    _paint_confident_boxes(image, list_boxes_read, min_pytesseract_confidence, (255, 255, 255))
    cv2.imwrite(output_image_anon_not_read, image)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    mask = cv2.threshold(gray, 240, 240, cv2.THRESH_BINARY)[1]
    contours = measure.find_contours(mask, 0.8)

    # NOTE(review): the previous version also allocated a second mask and drew
    # every contour into it with cv2.drawContours, but that mask was never
    # read or returned, and the points were passed in skimage (row, col)
    # order. That dead work has been dropped.
    list_boxes_hide_non_white = []
    for contour in contours:
        # skimage contours are (row, col) pairs: column is x, row is y.
        points = contour.astype(int)
        rows = points[:, 0]
        cols = points[:, 1]
        x = int(cols.min())
        y = int(rows.min())
        w = int(cols.max()) - x
        h = int(rows.max()) - y
        one_box = {"x" : x, "w" : w, "y" : y, "h" : h, "class" : "unread", "text" : ""}

        mean_color = np.mean(image[y:y+h, x:x+w])
        current_area = w * h
        big_enough = current_area > min_area_box_unread
        dark_enough = mean_color < lower_threshold_white
        if verbose:
            print(" current_area : " + str(current_area) + " mean_color : " + str(mean_color) + " x : " + str(x) + " y : " + str(y) + " w : " + str(w) + " h : " + str(h) + " mean_color : " + str(mean_color) + " lower_threshold_white : " + str(lower_threshold_white) + " min_area_box_unread : " + str(min_area_box_unread) + " current_area > min_area_box_unread : " + str(big_enough) + " mean_color < lower_threshold_white : " + str(dark_enough))
        if big_enough and dark_enough:
            list_boxes_hide_non_white.append(one_box)

    print(" We have not yet rewrite image !")

    return image, list_boxes_hide_non_white

143 

144 

145 

def split_name_or_adresse(list_nom,
                          exclude_word_split=("du", "de", "d'", "patient", "patiente", "place"),
                          exclude_bib_start=()):
    """Split names or addresses into the individual words worth anonymising.

    Args:
        list_nom: iterable of strings (names, addresses, ...).
        exclude_word_split: words that must never be kept on their own.
        exclude_bib_start: markers that discard a whole entry when the entry
            starts with one of them, or contains one of them preceded by a
            space (comparison is case-insensitive).

    Returns:
        The lower-cased words longer than two characters that are not in
        `exclude_word_split`, without duplicates, in order of first
        appearance.
    """
    excluded_markers = [marker.lower() for marker in exclude_bib_start]

    # A dict keeps insertion order and gives the de-duplication requested by
    # the original "TODO unicize new_list".
    kept_words = {}
    for nom in list_nom:
        lowered = nom.lower()
        # This filtering should ideally have happened when the names were
        # selected upstream.
        if any(lowered.startswith(marker) or " " + marker in lowered
               for marker in excluded_markers):
            continue
        # split() without argument also handles tabs, newlines and runs of
        # spaces, which OCR output commonly contains.
        for sub_nom in lowered.split():
            if len(sub_nom) > 2 and sub_nom not in exclude_word_split:
                kept_words.setdefault(sub_nom, None)

    return list(kept_words)

176 

177 

178 

def write_image_with_anon_text(source_image, source_image_anon,
                               list_text_to_write, verbose = False,
                               strat_replace_private_data = "replace_by_keyword"):
    """Render every replacement of `list_text_to_write` onto a page image.

    The image at `source_image` is loaded, each entry (a dict with "text",
    "xmin", "ymin", "xmax", "ymax" and "old_text") is drawn with
    `write_text_on_image`, and the result is saved to `source_image_anon`.
    `verbose` and `strat_replace_private_data` are accepted but not used.
    """
    print(" TO TEST ! ")
    import cv2

    canvas = cv2.imread(source_image)
    for entry in list_text_to_write:
        write_text_on_image(
            canvas,
            entry["text"],
            entry["xmin"],
            entry["ymin"],
            xmax=entry["xmax"],
            ymax=entry["ymax"],
            old_text=entry["old_text"],
        )
    cv2.imwrite(source_image_anon, canvas)

189 

190 

191 

def _first_info_dict(parsed):
    """Return the dict of private data carried by the parsed prompt result.

    A non-list value is returned unchanged. An empty list yields an empty
    dict. A list whose first element is a dict yields that element, with a
    warning when other elements are dropped. Any other list raises.
    """
    if not isinstance(parsed, list):
        return parsed
    if not parsed:
        print("WARNING : json_info_to_anon is empty, we set an empty dict !")
        return {}
    if not isinstance(parsed[0], dict):
        print("WARNING : json_info_to_anon is a list but not of dict ERROR raised !")
        print("json_info_to_anon : " + str(parsed))
        raise Exception("json_info_to_anon is a list but not of dict ERROR raised !")
    if len(parsed) > 1:
        # The check is made on the list itself: it used to be made on the
        # selected dict, i.e. on its number of keys.
        print("WARNING MISSING DATA : json_info_to_anon seems to have more than one element, we take the first one !" + str(parsed))
    return parsed[0]


def _replacement_entry(box, text, old_text):
    """Build the dict consumed by write_image_with_anon_text for one box."""
    xmin = box["x"]
    ymin = box["y"]
    xmax = xmin + box["w"]
    ymax = ymin + box["h"]
    return {"text" : text,
            "xmin" : int(xmin),
            "xmax" : int(xmax),
            "ymin" : int(ymin),
            "ymax" : int(ymax),
            "old_text" : old_text}


def _replacement_for_box(box, map_to_anon, list_word_contains):
    """Return the replacement entry for `box`, or None when nothing matches."""
    box_text_lower = box["text"].lower()
    test_to_find = box_text_lower.strip(" ")

    # Exact match: the whole box is one private word.
    if test_to_find in map_to_anon:
        return _replacement_entry(box, map_to_anon[test_to_find], box["text"])

    # Substring match. This seems to be needed only after an OCR step that
    # exports whole paragraphs instead of single words.
    if test_to_find == "" or not ("conf" not in box or box["conf"] >= 0):
        return None
    for word in list_word_contains:
        # map_to_anon is keyed by lower-cased words and the box text is
        # lower-cased too, so the pattern must be lower-cased as well;
        # otherwise a mixed-case pattern can never match.
        needle = word.lower()
        # TODO: match whole words only, so that a name contained in a common
        # word does not trigger a replacement.
        if needle in box_text_lower:
            keyword_for_word = map_to_anon[needle]
            # TODO (from the original author): the written text is the bare
            # keyword, the substituted sentence is only kept as "old_text".
            return _replacement_entry(box, keyword_for_word,
                                      box_text_lower.replace(needle, keyword_for_word))
    return None


def anon_document(result_info_to_anon, list_page_content,
                  verbose = False,
                  keyword = {},
                  hash_id_treatment = "default_anon_document_warning",
                  prefix_file = "",
                  word_to_keep = ["patient", "patiente"],
                  exclude_word_split = ["de","d'","du"],
                  word_trigger_hide_all_page = ["Carte Nationale d'identite"],
                  anon_all_unread=False,
                  remove_search_string_for_key_private_data=[],
                  exclude_pers = ["docteur"],
                  exclude_bib_start = [],
                  out_folder = None,
                  paragraphs_to_anon = []):
    """Anonymise every page of a document and assemble the result as a PDF.

    Args:
        result_info_to_anon: raw prompt result, parsed with
            `parse_json_from_prompt_result` into the private data to hide
            (category -> list of strings).
        list_page_content: page objects exposing `source_image` (image path)
            and `list_blocks["paragraphs"]` (boxes with "x", "y", "w", "h",
            "text" and optionally "conf").
        verbose: print pages and boxes while processing.
        keyword: extra words always replaced by "XXXXXX".
        hash_id_treatment, prefix_file: used to name the output PDF.
        word_to_keep, word_trigger_hide_all_page, exclude_pers: accepted but
            currently without effect in this function.
        exclude_word_split, exclude_bib_start: forwarded to
            `split_name_or_adresse`.
        anon_all_unread: also blank the regions detected as unread.
        remove_search_string_for_key_private_data: categories whose strings
            are searched as substrings of longer boxes.
        out_folder: folder of the PDF; defaults to the folder of the first
            anonymised image.
        paragraphs_to_anon: per-page lists of extra replacement entries;
            missing pages are padded with empty lists.

    Returns:
        ``(list_images_anon, pdf_path, unsplit_json_info_to_anon)``: the
        paths of the anonymised page images, the path of the PDF, and a
        shallow copy of the private data taken before names were split.

    Raises:
        Exception: the parsed result is a list whose first element is not a
            dict.
        IndexError: `list_page_content` is empty, so no PDF can be built.

    The list/dict defaults of the signature are only read, never mutated.
    """
    import os
    from contextlib import ExitStack

    from PIL import Image  # python3 -m pip install --upgrade Pillow
    from lib.lib_util import parse_json_from_prompt_result

    json_info_to_anon = _first_info_dict(
        parse_json_from_prompt_result(result_info_to_anon, verbose = verbose, lazy = False))

    # Shallow copy is enough: the values below are replaced, not mutated.
    unsplit_json_info_to_anon = json_info_to_anon.copy()

    # Split names and addresses into single words.
    for field in ("nom", "prenom", "adresse"):
        if field in json_info_to_anon:
            json_info_to_anon[field] = split_name_or_adresse(
                json_info_to_anon[field], exclude_word_split = exclude_word_split)
    if "PERS" in json_info_to_anon:
        json_info_to_anon["PERS"] = split_name_or_adresse(
            json_info_to_anon["PERS"], exclude_word_split = exclude_word_split,
            exclude_bib_start = exclude_bib_start)

    map_to_anon, list_word_contains = map_index_to_anon(
        json_info_to_anon, verbose = verbose,
        remove_search_string_for_key_private_data = remove_search_string_for_key_private_data)
    for k in keyword:
        map_to_anon[k] = "XXXXXX"

    # One (possibly empty) list of extra replacements per page.
    missing_pages = len(list_page_content) - len(paragraphs_to_anon)
    if missing_pages > 0:
        paragraphs_to_anon = list(paragraphs_to_anon) + [[] for _ in range(missing_pages)]

    list_images_anon = []
    for sub_doc_page, par_to_anon_one_page in zip(list_page_content, paragraphs_to_anon):
        if verbose:
            print("sub_doc_page : " + str(sub_doc_page))

        list_boxes_read = sub_doc_page.list_blocks["paragraphs"]
        list_text_to_write = list(par_to_anon_one_page)

        for box in list_boxes_read:
            if verbose:
                print("box : " + str(box))
            # NOTE(review): the original looped over `exclude_pers` here for
            # texts shorter than 30 characters and only issued a `break`,
            # which left that inner loop and nothing else. The loop had no
            # effect and was removed; confirm whether such boxes were meant
            # to be skipped.
            replacement = _replacement_for_box(box, map_to_anon, list_word_contains)
            if replacement is not None:
                list_text_to_write.append(replacement)

        source_image_anon = sub_doc_page.source_image + "_anon.png"
        list_images_anon.append(source_image_anon)
        write_image_with_anon_text(sub_doc_page.source_image, source_image_anon,
                                   list_text_to_write, verbose = verbose,
                                   strat_replace_private_data = "replace_by_keyword")

        # Remove the first box, which is expected to span the whole page.
        if len(list_boxes_read) > 0:
            print("TODO check that is is correct and get size of all the page")
            list_boxes_read = list_boxes_read[1:]

        output_image_anon_not_read = source_image_anon + "_not_read.png"
        _, list_boxes_hide_non_white = detect_not_read_text_then_find_mask_all_not_read(
            sub_doc_page.source_image,
            output_image_anon_not_read,
            list_boxes_read,
            verbose=False,
            lower_threshold_white=250)

        if anon_all_unread:
            write_blank_on_read_text_then_find_mask_all_not_read(
                source_image_anon, source_image_anon, list_boxes_hide_non_white, verbose = verbose)

        # NOTE(review): the listing this was recovered from lost its
        # indentation; this statement is assumed to run for every page and
        # not only when `anon_all_unread` is set. Confirm.
        sub_doc_page.list_blocks.setdefault("paragraphs", []).extend(list_boxes_hide_non_white)

    print(" TOTEST : TODO VR 15-6-23 : concat list_images_anon in pdf ! ")

    if not list_images_anon:
        raise IndexError("anon_document : no page to anonymise, the PDF cannot be built")

    pdf_name = "anon_" + prefix_file + "_" + hash_id_treatment + "_anon.pdf"
    out_folder = out_folder if out_folder is not None else os.path.dirname(list_images_anon[0])
    pdf_path = os.path.join(out_folder, pdf_name)

    # ExitStack closes every opened image once the PDF is written.
    with ExitStack() as stack:
        images = [stack.enter_context(Image.open(f)) for f in list_images_anon]
        images[0].save(
            pdf_path, "PDF", resolution=100.0, quality=70, save_all=True, append_images=images[1:]
        )

    return list_images_anon, pdf_path, unsplit_json_info_to_anon

347 

348