Coverage for lib/stockage/lib_pg_dataset_pg.py: 33%

3 statements  

« prev     ^ index     » next       coverage.py v7.9.1, created at 2026-01-26 23:58 +0100

1 

2 

def build_query_create_documents(
    table_name: str,
    match_page_sections_function_name: str,
    hash40: str,
) -> str:
    """Build the SQL script that sets up a documents table and its search function.

    The returned script, in order:

    * creates ``table_name`` if it does not exist (text columns, a
      ``created_at`` timestamp and a ``vector(1536)`` embedding column);
    * creates five indexes on it (btree on ``document_id``, ``source``,
      ``source_id`` and ``author``; brin on ``created_at``), each named
      ``<hash40>_ix_documents_<column>``;
    * enables row level security on the table;
    * creates or replaces the PL/pgSQL function
      ``match_page_sections_function_name``, which returns rows of the table
      filtered by date range and ``LIKE`` patterns, ordered by
      ``embedding <#> in_embedding`` and limited to ``in_match_count`` rows.
      The ``similarity`` column is that ``<#>`` value multiplied by -1.

    Every statement is written so the script can be executed repeatedly:
    the indexes use ``if not exists`` just like the table does.

    Note that a row whose ``source_id``, ``source``, ``author`` or
    ``document_id`` is NULL always passes the corresponding filter, whatever
    pattern the caller supplies.

    The three arguments are interpolated into the SQL text verbatim (they are
    identifiers and cannot be bound as query parameters), so they must come
    from trusted code, never from user input.

    Args:
        table_name: Name of the table to create and query.
        match_page_sections_function_name: Name of the search function to
            create or replace.
        hash40: Prefix used to build the index names.

    Returns:
        The complete SQL script as a single string.
    """
    # The ``create extension vector`` line stays commented out inside the SQL:
    # per the original note it has to be run separately by a superuser.
    return f"""

-- must be done by superuser, that we don't want the safia www to have access to
-- create extension vector;

create table if not exists {table_name} (
    id text primary key default gen_random_uuid()::text,
    source text,
    source_id text,
    content text,
    document_id text,
    author text,
    url text,
    created_at timestamptz default now(),
    embedding vector(1536)
);

create index if not exists {hash40}_ix_documents_document_id on {table_name} using btree ( document_id );
create index if not exists {hash40}_ix_documents_source on {table_name} using btree ( source );
create index if not exists {hash40}_ix_documents_source_id on {table_name} using btree ( source_id );
create index if not exists {hash40}_ix_documents_author on {table_name} using btree ( author );
create index if not exists {hash40}_ix_documents_created_at on {table_name} using brin ( created_at );

alter table {table_name} enable row level security;

create or replace function {match_page_sections_function_name}(in_embedding vector(1536)
    , in_match_count int default 3
    , in_document_id text default '%%'
    , in_source_id text default '%%'
    , in_source text default '%%'
    , in_author text default '%%'
    , in_start_date timestamptz default '-infinity'
    , in_end_date timestamptz default 'infinity')
returns table (id text
    , source text
    , source_id text
    , document_id text
    , url text
    , created_at timestamptz
    , author text
    , content text
    , embedding vector(1536)
    , similarity float)
language plpgsql
as $$
#variable_conflict use_variable
begin
    return query
    select
        {table_name}.id,
        {table_name}.source,
        {table_name}.source_id,
        {table_name}.document_id,
        {table_name}.url,
        {table_name}.created_at,
        {table_name}.author,
        {table_name}.content,
        {table_name}.embedding,
        ({table_name}.embedding <#> in_embedding) * -1 as similarity
    from {table_name}

    where in_start_date <= {table_name}.created_at and
        {table_name}.created_at <= in_end_date and
        ({table_name}.source_id like in_source_id or {table_name}.source_id is null) and
        ({table_name}.source like in_source or {table_name}.source is null) and
        ({table_name}.author like in_author or {table_name}.author is null) and
        ({table_name}.document_id like in_document_id or {table_name}.document_id is null)

    order by {table_name}.embedding <#> in_embedding

    limit in_match_count;
end;
$$;"""