Coverage for lib/stockage/lib_pg_dataset_pg.py: 33%
3 statements
« prev ^ index » next coverage.py v7.9.1, created at 2026-01-26 23:58 +0100
« prev ^ index » next coverage.py v7.9.1, created at 2026-01-26 23:58 +0100
def build_query_create_documents(table_name, match_page_sections_function_name, hash40):
    """Build the SQL script that creates a documents table and its search function.

    The returned script, in order:

    1. creates ``table_name`` (if it does not exist) with text metadata
       columns, a ``created_at`` timestamp and a ``vector(1536)`` embedding;
    2. creates one index per filterable column (btree for ``document_id``,
       ``source``, ``source_id`` and ``author``; brin for ``created_at``),
       each named ``<hash40>_ix_documents_<column>``;
    3. enables row level security on the table;
    4. creates or replaces the PL/pgSQL function
       ``match_page_sections_function_name``, which returns the
       ``in_match_count`` rows ordered by ``embedding <#> in_embedding``,
       restricted to a ``created_at`` range and to ``LIKE`` patterns on
       ``source_id``, ``source``, ``author`` and ``document_id``.  A row whose
       value is NULL in one of those four columns always passes that
       column's filter.  ``similarity`` is the ``<#>`` result multiplied by -1.

    Every statement is safe to run again on an existing schema: the index
    statements use ``if not exists`` so that they match the idempotence of the
    ``create table if not exists`` / ``create or replace function`` statements
    around them instead of failing on a second run.

    The script does not create the ``vector`` extension; per the embedded SQL
    comment that must be done separately by a superuser.

    Args:
        table_name: Name of the table, interpolated verbatim into the SQL.
        match_page_sections_function_name: Name of the search function,
            interpolated verbatim into the SQL.
        hash40: Prefix used to build the index names, interpolated verbatim.

    Returns:
        The complete SQL script as a single string.

    NOTE: the three arguments are spliced into the SQL as identifiers without
    quoting or validation, so they must come from trusted code, never from
    user input.
    """
    # (column, index method) for every index created on the table.
    index_specs = (
        ("document_id", "btree"),
        ("source", "btree"),
        ("source_id", "btree"),
        ("author", "btree"),
        ("created_at", "brin"),
    )
    index_statements = "\n".join(
        f"create index if not exists {hash40}_ix_documents_{column}"
        f" on {table_name} using {method} ( {column} );"
        for column, method in index_specs
    )

    # The '%%' defaults below are emitted literally (two percent signs).
    # NOTE(review): presumably doubled so they survive a driver that performs
    # %-style parameter interpolation -- confirm against the caller.
    #
    # Inside the function body every column is qualified with the table name
    # and `#variable_conflict use_variable` is set, because the names of the
    # `returns table` columns (id, source, ...) are the same as the table's
    # column names.
    query = f"""

-- must be done by superuser, that we don't want the safia www to have access to
-- create extension vector;

create table if not exists {table_name} (
  id text primary key default gen_random_uuid()::text,
  source text,
  source_id text,
  content text,
  document_id text,
  author text,
  url text,
  created_at timestamptz default now(),
  embedding vector(1536)
);

{index_statements}

alter table {table_name} enable row level security;

create or replace function {match_page_sections_function_name}(in_embedding vector(1536)
                                  , in_match_count int default 3
                                  , in_document_id text default '%%'
                                  , in_source_id text default '%%'
                                  , in_source text default '%%'
                                  , in_author text default '%%'
                                  , in_start_date timestamptz default '-infinity'
                                  , in_end_date timestamptz default 'infinity')
returns table (id text
            , source text
            , source_id text
            , document_id text
            , url text
            , created_at timestamptz
            , author text
            , content text
            , embedding vector(1536)
            , similarity float)
language plpgsql
as $$
#variable_conflict use_variable
begin
  return query
  select
    {table_name}.id,
    {table_name}.source,
    {table_name}.source_id,
    {table_name}.document_id,
    {table_name}.url,
    {table_name}.created_at,
    {table_name}.author,
    {table_name}.content,
    {table_name}.embedding,
    ({table_name}.embedding <#> in_embedding) * -1 as similarity
  from {table_name}

  where in_start_date <= {table_name}.created_at and
    {table_name}.created_at <= in_end_date and
    ({table_name}.source_id like in_source_id or {table_name}.source_id is null) and
    ({table_name}.source like in_source or {table_name}.source is null) and
    ({table_name}.author like in_author or {table_name}.author is null) and
    ({table_name}.document_id like in_document_id or {table_name}.document_id is null)

  order by {table_name}.embedding <#> in_embedding

  limit in_match_count;
end;
$$;"""

    return query