|
17 | 17 | from psycopg.types.json import Jsonb
|
18 | 18 | from psycopg import sql as pysql
|
19 | 19 |
|
20 |
| -from ..db.connection import connect, Connection, Cursor, server_version_tuple, \ |
| 20 | +from ..db.connection import connect, Connection, Cursor, \ |
21 | 21 | drop_tables, table_exists, execute_scalar
|
22 | 22 | from ..config import Configuration
|
23 | 23 | from ..db.sql_preprocessor import SQLPreprocessor
|
@@ -110,80 +110,37 @@ def update_statistics(self, config: Configuration, threads: int = 2) -> None:
|
110 | 110 | cur.execute(pysql.SQL('SET max_parallel_workers_per_gather TO {}')
|
111 | 111 | .format(pysql.Literal(min(threads, 6),)))
|
112 | 112 |
|
113 |
| - if server_version_tuple(conn) < (12, 0): |
114 |
| - LOG.info('Computing word frequencies') |
115 |
| - drop_tables(conn, 'word_frequencies', 'addressword_frequencies') |
116 |
| - cur.execute("""CREATE TEMP TABLE word_frequencies AS |
117 |
| - SELECT unnest(name_vector) as id, count(*) |
118 |
| - FROM search_name GROUP BY id""") |
119 |
| - cur.execute('CREATE INDEX ON word_frequencies(id)') |
120 |
| - cur.execute("""CREATE TEMP TABLE addressword_frequencies AS |
121 |
| - SELECT unnest(nameaddress_vector) as id, count(*) |
122 |
| - FROM search_name GROUP BY id""") |
123 |
| - cur.execute('CREATE INDEX ON addressword_frequencies(id)') |
124 |
| - cur.execute(""" |
125 |
| - CREATE OR REPLACE FUNCTION word_freq_update(wid INTEGER, |
126 |
| - INOUT info JSONB) |
127 |
| - AS $$ |
128 |
| - DECLARE rec RECORD; |
129 |
| - BEGIN |
130 |
| - IF info is null THEN |
131 |
| - info = '{}'::jsonb; |
132 |
| - END IF; |
133 |
| - FOR rec IN SELECT count FROM word_frequencies WHERE id = wid |
134 |
| - LOOP |
135 |
| - info = info || jsonb_build_object('count', rec.count); |
136 |
| - END LOOP; |
137 |
| - FOR rec IN SELECT count FROM addressword_frequencies WHERE id = wid |
138 |
| - LOOP |
139 |
| - info = info || jsonb_build_object('addr_count', rec.count); |
140 |
| - END LOOP; |
141 |
| - IF info = '{}'::jsonb THEN |
142 |
| - info = null; |
143 |
| - END IF; |
144 |
| - END; |
145 |
| - $$ LANGUAGE plpgsql IMMUTABLE; |
146 |
| - """) |
147 |
| - LOG.info('Update word table with recomputed frequencies') |
148 |
| - drop_tables(conn, 'tmp_word') |
149 |
| - cur.execute("""CREATE TABLE tmp_word AS |
150 |
| - SELECT word_id, word_token, type, word, |
151 |
| - word_freq_update(word_id, info) as info |
152 |
| - FROM word |
153 |
| - """) |
154 |
| - drop_tables(conn, 'word_frequencies', 'addressword_frequencies') |
155 |
| - else: |
156 |
| - LOG.info('Computing word frequencies') |
157 |
| - drop_tables(conn, 'word_frequencies') |
158 |
| - cur.execute(""" |
159 |
| - CREATE TEMP TABLE word_frequencies AS |
160 |
| - WITH word_freq AS MATERIALIZED ( |
161 |
| - SELECT unnest(name_vector) as id, count(*) |
162 |
| - FROM search_name GROUP BY id), |
163 |
| - addr_freq AS MATERIALIZED ( |
164 |
| - SELECT unnest(nameaddress_vector) as id, count(*) |
165 |
| - FROM search_name GROUP BY id) |
166 |
| - SELECT coalesce(a.id, w.id) as id, |
167 |
| - (CASE WHEN w.count is null THEN '{}'::JSONB |
168 |
| - ELSE jsonb_build_object('count', w.count) END |
169 |
| - || |
170 |
| - CASE WHEN a.count is null THEN '{}'::JSONB |
171 |
| - ELSE jsonb_build_object('addr_count', a.count) END) as info |
172 |
| - FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; |
173 |
| - """) |
174 |
| - cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') |
175 |
| - cur.execute('ANALYSE word_frequencies') |
176 |
| - LOG.info('Update word table with recomputed frequencies') |
177 |
| - drop_tables(conn, 'tmp_word') |
178 |
| - cur.execute("""CREATE TABLE tmp_word AS |
179 |
| - SELECT word_id, word_token, type, word, |
180 |
| - (CASE WHEN wf.info is null THEN word.info |
181 |
| - ELSE coalesce(word.info, '{}'::jsonb) || wf.info |
182 |
| - END) as info |
183 |
| - FROM word LEFT JOIN word_frequencies wf |
184 |
| - ON word.word_id = wf.id |
185 |
| - """) |
186 |
| - drop_tables(conn, 'word_frequencies') |
| 113 | + LOG.info('Computing word frequencies') |
| 114 | + drop_tables(conn, 'word_frequencies') |
| 115 | + cur.execute(""" |
| 116 | + CREATE TEMP TABLE word_frequencies AS |
| 117 | + WITH word_freq AS MATERIALIZED ( |
| 118 | + SELECT unnest(name_vector) as id, count(*) |
| 119 | + FROM search_name GROUP BY id), |
| 120 | + addr_freq AS MATERIALIZED ( |
| 121 | + SELECT unnest(nameaddress_vector) as id, count(*) |
| 122 | + FROM search_name GROUP BY id) |
| 123 | + SELECT coalesce(a.id, w.id) as id, |
| 124 | + (CASE WHEN w.count is null THEN '{}'::JSONB |
| 125 | + ELSE jsonb_build_object('count', w.count) END |
| 126 | + || |
| 127 | + CASE WHEN a.count is null THEN '{}'::JSONB |
| 128 | + ELSE jsonb_build_object('addr_count', a.count) END) as info |
| 129 | + FROM word_freq w FULL JOIN addr_freq a ON a.id = w.id; |
| 130 | + """) |
| 131 | + cur.execute('CREATE UNIQUE INDEX ON word_frequencies(id) INCLUDE(info)') |
| 132 | + cur.execute('ANALYSE word_frequencies') |
| 133 | + LOG.info('Update word table with recomputed frequencies') |
| 134 | + drop_tables(conn, 'tmp_word') |
| 135 | + cur.execute("""CREATE TABLE tmp_word AS |
| 136 | + SELECT word_id, word_token, type, word, |
| 137 | + (CASE WHEN wf.info is null THEN word.info |
| 138 | + ELSE coalesce(word.info, '{}'::jsonb) || wf.info |
| 139 | + END) as info |
| 140 | + FROM word LEFT JOIN word_frequencies wf |
| 141 | + ON word.word_id = wf.id |
| 142 | + """) |
| 143 | + drop_tables(conn, 'word_frequencies') |
187 | 144 |
|
188 | 145 | with conn.cursor() as cur:
|
189 | 146 | cur.execute('SET max_parallel_workers_per_gather TO 0')
|
|
0 commit comments