Coverage for lib/anon/lib_anon.py

def map_index_to_anon(json_info_to_anon,
                      verbose=False,
                      remove_search_string_for_key_private_data=None):
    """Build the lookup structures used to replace private data.

    Args:
        json_info_to_anon: mapping ``category -> iterable of words`` (strings).
        verbose: accepted for interface compatibility; not used by this function.
        remove_search_string_for_key_private_data: categories whose words must
            also be searched as substrings of longer texts. ``None`` (the
            default) means no category.

    Returns:
        A tuple ``(map_to_anon, words_to_search)`` where ``map_to_anon`` maps
        each lower-cased word to the first category it was seen under, and
        ``words_to_search`` lists the lower-cased, de-duplicated words of the
        selected categories, longest first.
    """
    if remove_search_string_for_key_private_data is None:
        remove_search_string_for_key_private_data = []

    map_to_anon = {}
    list_text_contains_to_find = []
    for key in json_info_to_anon:
        for word in json_info_to_anon[key]:
            if word.lower() in map_to_anon:
                print("WARNING : word " + str(word) + " already in map_to_anon but seemlingly with different ")
            else:
                map_to_anon[word.lower()] = key
        if key in remove_search_string_for_key_private_data:
            # Lower-case here as well: map_to_anon is keyed by lower-cased
            # words, so every searched word must be usable as a key of that map.
            list_text_contains_to_find.extend(
                word.lower() for word in json_info_to_anon[key]
            )

    # dict.fromkeys drops duplicates while keeping first-seen order; sorted()
    # is stable, so words of equal length keep that order.
    unique_words = list(dict.fromkeys(list_text_contains_to_find))
    list_text_contains_to_find_ordered_by_decreasing_length = sorted(
        unique_words, key=len, reverse=True
    )

    return map_to_anon, list_text_contains_to_find_ordered_by_decreasing_length

def write_text_on_image(image, text, xmin, ymin,
                        xmax=None,
                        ymax=None,
                        color=(0, 0, 0),
                        font_size=5,
                        old_text=""):
    """Draw ``text`` on ``image`` in place, optionally over a filled box.

    Args:
        image: image passed to the OpenCV drawing calls; it is modified in place.
        text: replacement text to draw.
        xmin, ymin: top-left corner of the target area.
        xmax, ymax: bottom-right corner. When both are given, the area is
            filled first and the font scale is derived from the box height
            (``font_size`` is then ignored).
        color: text colour passed to ``cv2.putText``.
        font_size: font scale used when no box is given.
        old_text: text being replaced. When non-empty, ``text`` is cut so that
            it is never longer than ``old_text``.
    """
    import cv2

    font = cv2.FONT_HERSHEY_SIMPLEX

    if xmax is not None and ymax is not None:
        # Font scale is proportional to the box height.
        font_size = (ymax - ymin) / 20
        # Hide the original content before writing over it.
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (255, 200, 255), -1)

    # Only clamp against a real previous text: with the default empty
    # old_text the replacement would otherwise always be cut to nothing.
    if old_text and len(text) > len(old_text):
        text = text[:len(old_text)]

    # putText anchors the text at its bottom-left corner; without a box the
    # only vertical coordinate available is ymin.
    baseline_y = ymax if ymax is not None else ymin
    cv2.putText(image, text, (xmin, baseline_y), font, font_size, color, 2, cv2.LINE_AA)

48# from PIL import Image

49# from PIL import ImageFont

50# from PIL import ImageDraw

51#

52# img = Image.open("sample_in.jpg")

53# draw = ImageDraw.Draw(img)

54# # font = ImageFont.truetype(<font-file>, <font-size>)

55# font = ImageFont.truetype("sans-serif.ttf", 16)

56# # draw.text((x, y),"Sample Text",(r,g,b))

57# draw.text((0, 0),"Sample Text",(255,255,255),font=font)

58# img.save('sample-out.jpg')

def write_blank_on_read_text_then_find_mask_all_not_read(source_image, output_image, list_boxes_unread, verbose = False):
    """Fill every listed box of ``source_image`` and save it to ``output_image``.

    Each entry of ``list_boxes_unread`` is a dict with the keys ``"x"``,
    ``"y"``, ``"w"`` and ``"h"``. ``verbose`` is accepted but not used.
    """
    import cv2

    fill_color = (255, 255, 200)
    canvas = cv2.imread(source_image)

    for box in list_boxes_unread:
        left = box["x"]
        top = box["y"]
        right = left + box["w"]
        bottom = top + box["h"]
        # Thickness -1 asks OpenCV for a filled rectangle.
        cv2.rectangle(canvas, (left, top), (right, bottom), fill_color, -1)

    cv2.imwrite(output_image, canvas)

def detect_not_read_text_then_find_mask_all_not_read(source_image,
                                                     output_image_anon_not_read,
                                                     list_boxes_read, verbose=False,
                                                     lower_threshold_white=250,
                                                     param_threshold=None):
    """Locate the non-white regions of a page that no read box covers.

    Two images are written:

    * ``output_image_anon_not_read + "_read_color.png"``: the page with every
      confidently read box filled with a tint;
    * ``output_image_anon_not_read``: the page with the same boxes filled in white.

    The white-filled page is then thresholded and its contours are turned into
    bounding boxes.

    Args:
        source_image: path of the page image.
        output_image_anon_not_read: path of the white-filled output image.
        list_boxes_read: dicts with the keys ``"x"``, ``"y"``, ``"w"``,
            ``"h"``, ``"text"`` and optionally ``"conf"``.
        verbose: print the metrics of every candidate box.
        lower_threshold_white: a candidate is kept only when its mean pixel
            value is below this threshold.
        param_threshold: optional overrides for ``"min_area_box_unread"``
            (default 100) and ``"min_pytesseract_confidence"`` (default 40).

    Returns:
        ``(image, list_boxes_hide_non_white)``: the white-filled image and the
        list of kept boxes, each a dict with ``x``, ``w``, ``y``, ``h``,
        ``class`` (``"unread"``) and ``text`` (``""``).
    """
    import cv2
    import numpy as np
    from skimage import measure

    if param_threshold is None:
        param_threshold = {}
    min_area_box_unread = param_threshold.get("min_area_box_unread", 100)
    min_pytesseract_confidence = param_threshold.get("min_pytesseract_confidence", 40)

    def _page_with_read_boxes_filled(fill_color):
        """Reload the page and fill every confidently read box with fill_color."""
        page = cv2.imread(source_image)
        for box in list_boxes_read:
            xmin = box["x"]
            xmax = box["x"] + box["w"]
            ymin = box["y"]
            ymax = box["y"] + box["h"]
            # Boxes without a "conf" key are treated as confidently read.
            if box["text"] != "" and ("conf" not in box or box["conf"] > min_pytesseract_confidence):
                cv2.rectangle(page, (xmin, ymin), (xmax, ymax), fill_color, -1)
        return page

    cv2.imwrite(output_image_anon_not_read + "_read_color.png",
                _page_with_read_boxes_filled((255, 200, 255)))

    image = _page_with_read_boxes_filled((255, 255, 255))
    cv2.imwrite(output_image_anon_not_read, image)

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # threshold input image as mask
    mask = cv2.threshold(gray, 240, 240, cv2.THRESH_BINARY)[1]

    list_boxes_hide_non_white = []
    # uses scikit-image to find rectangle boxes with non white pixels
    for contour in measure.find_contours(mask, 0.8):
        # Contour points are (row, col) pairs: column 0 is y, column 1 is x.
        points = contour.astype(int)
        x = int(points[:, 1].min())
        y = int(points[:, 0].min())
        w = int(points[:, 1].max()) - x
        h = int(points[:, 0].max()) - y
        one_box = {"x": x, "w": w, "y": y, "h": h, "class": "unread", "text": ""}
        current_area = w * h
        # A degenerate contour gives an empty slice; skip np.mean there so it
        # does not emit a "mean of empty slice" warning. The value stays NaN.
        mean_color = np.mean(image[y:y + h, x:x + w]) if current_area > 0 else np.nan
        if verbose:
            print(" current_area : " + str(current_area) + " mean_color : " + str(mean_color) + " x : " + str(x) + " y : " + str(y) + " w : " + str(w) + " h : " + str(h) + " mean_color : " + str(mean_color) + " lower_threshold_white : " + str(lower_threshold_white) + " min_area_box_unread : " + str(min_area_box_unread) + " current_area > min_area_box_unread : " + str(current_area > min_area_box_unread) + " mean_color < lower_threshold_white : " + str(mean_color < lower_threshold_white))
        if current_area > min_area_box_unread and mean_color < lower_threshold_white:
            list_boxes_hide_non_white.append(one_box)

    print(" We have not yet rewrite image !")

    return image, list_boxes_hide_non_white

143

144

145

def split_name_or_adresse(list_nom,
                          exclude_word_split=None,
                          exclude_bib_start=None):
    """Split names or addresses into the individual words to anonymise.

    Args:
        list_nom: iterable of strings. A single string is treated as a
            one-element list and ``None`` as an empty list.
        exclude_word_split: words that must never be returned (compared
            case-insensitively). Defaults to
            ``["du", "de", "d'", "patient", "patiente", "place"]``.
        exclude_bib_start: an entry is dropped entirely when it starts with
            one of these strings, or contains one of them right after a space
            (case-insensitive). Defaults to no exclusion.

    Returns:
        The lower-cased words longer than two characters, in first-seen order
        and without duplicates.
    """
    if exclude_word_split is None:
        exclude_word_split = ["du", "de", "d'", "patient", "patiente", "place"]
    if exclude_bib_start is None:
        exclude_bib_start = []
    if list_nom is None:
        return []
    if isinstance(list_nom, str):
        # Iterating a bare string would walk it character by character and
        # return nothing; treat it as a single entry instead.
        list_nom = [list_nom]

    # Tokens are lower-cased below, so the exclusion lists must be as well.
    excluded_words = {word.lower() for word in exclude_word_split}
    excluded_starts = [start.lower() for start in exclude_bib_start]

    new_list = []
    seen = set()
    for nom in list_nom:
        lowered = nom.lower()
        # All of this should have been done when the names were selected.
        if any(lowered.startswith(start) or " " + start in lowered
               for start in excluded_starts):
            continue
        # split() without argument also handles tabs, newlines and runs of spaces.
        for sub_nom in lowered.split():
            if len(sub_nom) > 2 and sub_nom not in excluded_words and sub_nom not in seen:
                seen.add(sub_nom)
                new_list.append(sub_nom)

    return new_list

176

177

178

def write_image_with_anon_text(source_image, source_image_anon,
                               list_text_to_write, verbose = False,
                               strat_replace_private_data = "replace_by_keyword"):
    """Write every replacement text onto ``source_image`` and save the result.

    Each entry of ``list_text_to_write`` is a dict with the keys ``"text"``,
    ``"xmin"``, ``"ymin"``, ``"xmax"``, ``"ymax"`` and ``"old_text"``. The
    result is written to ``source_image_anon``. ``verbose`` and
    ``strat_replace_private_data`` are accepted but not used.
    """
    print(" TO TEST ! ")
    import cv2

    page = cv2.imread(source_image)
    for entry in list_text_to_write:
        write_text_on_image(
            page,
            entry["text"],
            entry["xmin"],
            entry["ymin"],
            entry["xmax"],
            entry["ymax"],
            old_text=entry["old_text"],
        )
    cv2.imwrite(source_image_anon, page)

189

190

191

192def anon_document(result_info_to_anon, list_page_content,

193 verbose = False,

194 keyword = {},

195 hash_id_treatment = "default_anon_document_warning",

196 prefix_file = "",

197 word_to_keep = ["patient", "patiente"],

198 exclude_word_split = ["de","d'","du"],

199 word_trigger_hide_all_page = ["Carte Nationale d'identite"],

200 anon_all_unread=False,

201 remove_search_string_for_key_private_data=[],

202 exclude_pers = ["docteur"],

203 exclude_bib_start = [],

204 out_folder = None,

205 paragraphs_to_anon = []): # TODO voila plop
# Purpose: parse the private-data description, build the word -> category map,
# write one "<source_image>_anon.png" per page with the matching boxes
# replaced, then bundle those images into a single PDF.
#
# NOTE(review): this listing has lost its indentation; the nesting described
# in the comments below is inferred from statement order - confirm against
# the real file.
#
# Parameters (as used by the visible code):
#   result_info_to_anon: raw result handed to parse_json_from_prompt_result.
#   list_page_content: pages; each exposes list_boxes, list_blocks and source_image.
#   keyword: every key is mapped to the placeholder "XXXXXX".
#   hash_id_treatment, prefix_file: used to build the PDF file name.
#   word_to_keep, word_trigger_hide_all_page: not referenced in the visible body.
#   exclude_word_split: forwarded to split_name_or_adresse.
#   anon_all_unread: when true, the unread boxes are painted over on the anon image.
#   remove_search_string_for_key_private_data: forwarded to map_index_to_anon.
#   exclude_pers: words looked for in boxes shorter than 30 characters.
#   exclude_bib_start: forwarded to split_name_or_adresse for the "PERS" key only.
#   out_folder: PDF folder; defaults to the folder of the first anon image.
#   paragraphs_to_anon: per-page lists of pre-built entries, padded with empty lists.
#
# Returns: (list_images_anon, pdf_path, unsplit_json_info_to_anon).
#
# NOTE(review): several defaults are mutable objects ({} / []); the visible
# code does not mutate them in place, but confirm before relying on it.

206 from lib.lib_util import parse_json_from_prompt_result

207 json_info_to_anon = parse_json_from_prompt_result(result_info_to_anon, verbose = verbose, lazy = False)

208

# Normalise the parsed result: a one-element list is unwrapped directly.
209 if type(json_info_to_anon) == list and len(json_info_to_anon) == 1:

210 json_info_to_anon = json_info_to_anon[0]

211 else:

# NOTE(review): this branch is taken for every value that is not a
# one-element list (presumably including a plain dict - TODO confirm),
# although the message only mentions "more than one element".
212 print("WARNING : json_info_to_anon seems to have more than one element, we take the first one !")

213 # Test if json_info_to_anon is a list of dict

214 if type(json_info_to_anon) == list:

215 if len(json_info_to_anon) == 0:

216 print("WARNING : json_info_to_anon is empty, we set an empty dict !")

217 json_info_to_anon = {}

218 elif len(json_info_to_anon) > 0 and type(json_info_to_anon[0]) == dict:

219 json_info_to_anon = json_info_to_anon[0]

# NOTE(review): json_info_to_anon was rebound to its first element on the
# previous statement, so this len() no longer measures the original list -
# confirm intent.
220 if len(json_info_to_anon) > 1:

221 print("WARNING MISSING DATA : json_info_to_anon seems to have more than one element, we take the first one !" + str(json_info_to_anon))

222 else:

223 print("WARNING : json_info_to_anon is a list but not of dict ERROR raised !")

224 print("json_info_to_anon : " + str(json_info_to_anon))

225 # an exception must be raised

226 raise Exception("json_info_to_anon is a list but not of dict ERROR raised !")

227

# Copy taken before the values are split below; this copy is what is returned.
228 unsplit_json_info_to_anon = json_info_to_anon.copy()

229

230 # Split name and address

231 if "nom" in json_info_to_anon:

232 json_info_to_anon["nom"] = split_name_or_adresse(json_info_to_anon["nom"], exclude_word_split = exclude_word_split)

233 if "prenom" in json_info_to_anon:

234 json_info_to_anon["prenom"] = split_name_or_adresse(json_info_to_anon["prenom"], exclude_word_split = exclude_word_split)

235 if "adresse" in json_info_to_anon:

236 json_info_to_anon["adresse"] = split_name_or_adresse(json_info_to_anon["adresse"], exclude_word_split = exclude_word_split)

237 if "PERS" in json_info_to_anon:

238 json_info_to_anon["PERS"] = split_name_or_adresse(json_info_to_anon["PERS"], exclude_word_split = exclude_word_split, exclude_bib_start = exclude_bib_start)

239

240 map_to_anon, list_word_contains = map_index_to_anon(json_info_to_anon, verbose = verbose, remove_search_string_for_key_private_data = remove_search_string_for_key_private_data )

241 for k in keyword: # TODO I do not know what this loop is for VR 20-2-24

242 map_to_anon[k] = "XXXXXX"

243

244 list_images_anon = []

245

246 # anon_all_unread

247

# Pad paragraphs_to_anon with empty lists so that zip() below covers every page.
248 if len(paragraphs_to_anon) < len(list_page_content):

249 paragraphs_to_anon = paragraphs_to_anon + [[]] * (len(list_page_content) - len(paragraphs_to_anon))

250 for (sub_doc_page, par_to_anon_one_page) in zip(list_page_content, paragraphs_to_anon):

251 if verbose:

252 print("sub_doc_page : " + str(sub_doc_page))

253

# NOTE(review): the first assignment is overwritten by the next statement.
254 list_boxes_read = sub_doc_page.list_boxes

255 list_boxes_read = sub_doc_page.list_blocks["paragraphs"]

256 list_text_to_write = []

257

# Start from a copy of the pre-built entries of this page (replaces the empty list above).
258 list_text_to_write = par_to_anon_one_page.copy()

259

260 for box in list_boxes_read:

261 if verbose:

262 print("box : " + str(box))

# Lower-cased box text with leading/trailing spaces removed; used as lookup key.
263 test_to_find = box["text"].lower().rstrip(" ").lstrip(" ")

264

265 # just looking for docteur

266 if len(test_to_find) < 30:

267 for word in exclude_pers:

268 if word in test_to_find:

# NOTE(review): this break only leaves the `for word in exclude_pers` loop;
# nothing visible here skips the box afterwards - confirm intent.
269 break

# Exact match: the whole box text is a known private word.
270 if test_to_find in map_to_anon:

271 xmin = box["x"]

272 ymin = box["y"]

273 xmax = xmin + box["w"]

274 ymax = ymin + box["h"]

275 new_text = {"text" : map_to_anon[test_to_find],

276 "xmin" : int(xmin),

277 "xmax" : int(xmax),

278 "ymin" : int(ymin),

279 "ymax" : int(ymax),

280 "old_text" : box["text"]}

281# "font_size" : box.font_size}

282 list_text_to_write.append(new_text)

283 # print(" TODO add y or z or voila")

284 else :

285 # This seems to be useful only for anonymisation after gcp_doc_ai, where paragraphs are exported and not just words!

286 for word in list_word_contains:

287 if ("conf" not in box or box["conf"] >= 0) and test_to_find != "" and word in box["text"].lower(): # VR TODO: we will need to check that it is a whole word, to avoid the case where a name is part of another common word

288 xmin = box["x"]

289 ymin = box["y"]

290 xmax = xmin + box["w"]

291 ymax = ymin + box["h"]

292 new_text_str = box["text"].lower().replace(word, map_to_anon[word])

293 new_text = {"text" : map_to_anon[word], # VR TODO to be fixed

294 "xmin" : int(xmin),

295 "xmax" : int(xmax),

296 "ymin" : int(ymin),

297 "ymax" : int(ymax),

298 "old_text" : new_text_str}

299 list_text_to_write.append(new_text)

# Only the first matching word is handled for a given box.
300 break

301

302 source_image_anon = sub_doc_page.source_image + "_anon.png"

303 list_images_anon.append(source_image_anon)

304 write_image_with_anon_text(sub_doc_page.source_image, source_image_anon,

305 list_text_to_write, verbose = verbose, strat_replace_private_data = "replace_by_keyword")

306

307 # remove first box that is on all the page

308 if len(list_boxes_read) > 0:

309 print("TODO check that is is correct and get size of all the page")

310 list_boxes_read = list_boxes_read[1:]

311

312 output_image_anon_not_read = source_image_anon + "_not_read.png"

313

# Detection always runs with verbose=False and lower_threshold_white=250,
# whatever the verbose flag of this function is.
314 image_very_anon_np, list_boxes_hide_non_white = detect_not_read_text_then_find_mask_all_not_read(sub_doc_page.source_image,

315 output_image_anon_not_read,

316 list_boxes_read,

317 verbose=False,

318 lower_threshold_white=250)

319

320 if anon_all_unread:

321 import cv2

# Source and destination are the same file: the anon image is overwritten in place.
322 write_blank_on_read_text_then_find_mask_all_not_read(source_image_anon, source_image_anon, list_boxes_hide_non_white, verbose = verbose)

323

# NOTE(review): it cannot be told from this listing whether the next three
# statements belong to the `if anon_all_unread:` branch - confirm.
324 if "paragraphs" not in sub_doc_page.list_blocks:

325 sub_doc_page.list_blocks["paragraphs"] = []

326 sub_doc_page.list_blocks["paragraphs"].extend(list_boxes_hide_non_white)

327

328 print(" TOTEST : TODO VR 15-6-23 : concat list_images_anon in pdf ! ")

329

330 from PIL import Image # install by > python3 -m pip install --upgrade Pillow # ref. https://pillow.readthedocs.io/en/latest/installation.html#basic-installation

331

332 pdf_name = "anon_" + prefix_file + "_" + hash_id_treatment + "_anon.pdf"

333 import os

# NOTE(review): list_images_anon[0] (and images[0] below) assume at least one page.
334 out_folder = out_folder if out_folder != None else os.path.dirname(list_images_anon[0])

335 pdf_path = os.path.join(out_folder, pdf_name)

336

# NOTE(review): the opened images are not explicitly closed after the PDF is saved.
337 images = [

338 Image.open(f)

339 for f in list_images_anon

340 ]

341

# The first image is saved as PDF and the remaining ones are appended as extra pages.
342 images[0].save(

343 pdf_path, "PDF", resolution=100.0, quality=70, save_all=True, append_images=images[1:]

344 )

345

346 return list_images_anon, pdf_path, unsplit_json_info_to_anon

347

348

Coverage for lib/anon/lib_anon.py: 78%

184 statements