From ad60ec1de68de8fb77638d93bae6d5d087d8f0c7 Mon Sep 17 00:00:00 2001 From: Dens Sumesh Date: Mon, 22 Jul 2024 20:39:04 -0700 Subject: [PATCH] feature: instead of deleting rows, mark them as duplicate --- docker/clustering-script/get_clusters.py | 2 +- docker/collapse-query-script/collapse_queries.py | 7 ++++--- .../1721705174_mark_rows_as_duplicates/down.sql | 2 ++ .../1721705174_mark_rows_as_duplicates/up.sql | 2 ++ server/src/operators/analytics_operator.rs | 12 ++++++------ 5 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 server/ch_migrations/1721705174_mark_rows_as_duplicates/down.sql create mode 100644 server/ch_migrations/1721705174_mark_rows_as_duplicates/up.sql diff --git a/docker/clustering-script/get_clusters.py b/docker/clustering-script/get_clusters.py index 447e54b18c..b6617ab29b 100644 --- a/docker/clustering-script/get_clusters.py +++ b/docker/clustering-script/get_clusters.py @@ -22,7 +22,7 @@ def fetch_dataset_vectors( SELECT id, query, top_score, query_vector FROM default.search_queries WHERE dataset_id = '{}' - AND created_at >= now() - INTERVAL 7 DAY + AND created_at >= now() - INTERVAL 7 DAY AND is_duplicate = 0 ORDER BY rand() LIMIT {} """.format( diff --git a/docker/collapse-query-script/collapse_queries.py b/docker/collapse-query-script/collapse_queries.py index cbff512f1d..9f5f394747 100644 --- a/docker/collapse-query-script/collapse_queries.py +++ b/docker/collapse-query-script/collapse_queries.py @@ -16,7 +16,7 @@ def get_search_queries( query = """ SELECT id, query, top_score, created_at FROM default.search_queries - WHERE dataset_id = '{}' + WHERE dataset_id = '{}' AND is_duplicate = 0 ORDER BY created_at, length(query) LIMIT {} """.format( @@ -28,7 +28,7 @@ def get_search_queries( SELECT id, query, top_score, created_at FROM default.search_queries WHERE dataset_id = '{}' - AND created_at >= '{}' + AND created_at >= '{}' AND is_duplicate = 0 ORDER BY created_at, length(query) LIMIT {} """.format( @@ -130,7 +130,8 @@ def collapse_queries(rows): def delete_queries(client: clickhouse_connect.driver.client.Client, rows): for row in rows: query = """ - DELETE FROM default.search_queries + ALTER TABLE default.search_queries + UPDATE is_duplicate = 1 WHERE id = '{}' """.format( str(row[0]) diff --git a/server/ch_migrations/1721705174_mark_rows_as_duplicates/down.sql b/server/ch_migrations/1721705174_mark_rows_as_duplicates/down.sql new file mode 100644 index 0000000000..f007e300f4 --- /dev/null +++ b/server/ch_migrations/1721705174_mark_rows_as_duplicates/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE search_queries +DROP COLUMN IF EXISTS is_duplicate; \ No newline at end of file diff --git a/server/ch_migrations/1721705174_mark_rows_as_duplicates/up.sql b/server/ch_migrations/1721705174_mark_rows_as_duplicates/up.sql new file mode 100644 index 0000000000..44d0716e44 --- /dev/null +++ b/server/ch_migrations/1721705174_mark_rows_as_duplicates/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE search_queries +ADD COLUMN IF NOT EXISTS is_duplicate UInt8 DEFAULT 0; diff --git a/server/src/operators/analytics_operator.rs b/server/src/operators/analytics_operator.rs index 98a5ca10a3..bb919accf7 100644 --- a/server/src/operators/analytics_operator.rs +++ b/server/src/operators/analytics_operator.rs @@ -70,7 +70,7 @@ pub async fn get_queries_for_cluster_query( FROM search_queries JOIN search_cluster_memberships ON search_queries.id = search_cluster_memberships.search_id WHERE search_cluster_memberships.cluster_id = ? - AND search_queries.dataset_id = ? + AND search_queries.dataset_id = ? AND search_queries.is_duplicate = 0 ORDER BY search_cluster_memberships.distance_to_centroid DESC LIMIT 15 @@ -107,7 +107,7 @@ pub async fn get_search_query( clickhouse_client: &clickhouse::Client, ) -> Result { let clickhouse_query = clickhouse_client - .query("SELECT ?fields FROM search_queries WHERE id = ? AND dataset_id = ?") + .query("SELECT ?fields FROM search_queries WHERE id = ? AND dataset_id = ? AND search_queries.is_duplicate = 0") .bind(search_id) .bind(dataset_id) .fetch_one::() @@ -173,7 +173,7 @@ pub async fn get_head_queries_query( count(*) AS count FROM default.search_queries - WHERE dataset_id = ?", + WHERE dataset_id = ? AND search_queries.is_duplicate = 0", ); if let Some(filter) = filter { @@ -218,7 +218,7 @@ pub async fn get_low_confidence_queries_query( ?fields FROM default.search_queries - WHERE dataset_id = ?", + WHERE dataset_id = ? AND search_queries.is_duplicate = 0", ); if let Some(filter) = filter { @@ -279,7 +279,7 @@ pub async fn get_no_result_queries_query( FROM default.search_queries WHERE dataset_id = ? - AND top_score = 0", + AND top_score = 0 AND search_queries.is_duplicate = 0", ); if let Some(filter) = filter { @@ -329,7 +329,7 @@ pub async fn get_all_queries_query( ?fields FROM default.search_queries - WHERE dataset_id = ?", + WHERE dataset_id = ? AND search_queries.is_duplicate = 0", ); if let Some(filter) = filter {