
Commit

feature: instead of deleting rows, mark them as duplicate
densumesh committed Jul 23, 2024
1 parent dfc3349 commit ad60ec1
Showing 5 changed files with 15 additions and 10 deletions.
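In short, the collapse script no longer hard-deletes rows from default.search_queries. A new is_duplicate UInt8 DEFAULT 0 column is added to the table, collapsed queries are flagged with ALTER TABLE ... UPDATE is_duplicate = 1, and every read path touched here (the clustering script, the collapse script, and the Rust analytics queries) now filters on is_duplicate = 0.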
2 changes: 1 addition & 1 deletion docker/clustering-script/get_clusters.py
@@ -22,7 +22,7 @@ def fetch_dataset_vectors(
SELECT id, query, top_score, query_vector
FROM default.search_queries
WHERE dataset_id = '{}'
-AND created_at >= now() - INTERVAL 7 DAY
+AND created_at >= now() - INTERVAL 7 DAY AND is_duplicate = 0
ORDER BY rand()
LIMIT {}
""".format(
7 changes: 4 additions & 3 deletions docker/collapse-query-script/collapse_queries.py
@@ -16,7 +16,7 @@ def get_search_queries(
query = """
SELECT id, query, top_score, created_at
FROM default.search_queries
-WHERE dataset_id = '{}'
+WHERE dataset_id = '{}' AND is_duplicate = 0
ORDER BY created_at, length(query)
LIMIT {}
""".format(
@@ -28,7 +28,7 @@
SELECT id, query, top_score, created_at
FROM default.search_queries
WHERE dataset_id = '{}'
-AND created_at >= '{}'
+AND created_at >= '{}' AND is_duplicate = 0
ORDER BY created_at, length(query)
LIMIT {}
""".format(
@@ -130,7 +130,8 @@ def collapse_queries(rows):
def delete_queries(client: clickhouse_connect.driver.client.Client, rows):
for row in rows:
query = """
-DELETE FROM default.search_queries
+ALTER TABLE default.search_queries
+UPDATE is_duplicate = 1
WHERE id = '{}'
""".format(
str(row[0])
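For orientation, here is a minimal sketch of the pattern this last hunk introduces, written with clickhouse_connect as the script already uses. The function name mark_duplicate_queries, the batched IN clause, and the connection setup are assumptions for illustration, not the literal contents of collapse_queries.py:

import clickhouse_connect

def mark_duplicate_queries(client: clickhouse_connect.driver.client.Client, rows):
    # row[0] is the query id, mirroring how delete_queries iterates over rows.
    ids = ", ".join("'{}'".format(str(row[0])) for row in rows)
    if not ids:
        return
    # Flag the collapsed rows instead of deleting them. ALTER TABLE ... UPDATE
    # is a background mutation in ClickHouse, not an immediate row rewrite.
    client.command(
        """
        ALTER TABLE default.search_queries
        UPDATE is_duplicate = 1
        WHERE id IN ({})
        """.format(ids)
    )

# Hypothetical usage:
# client = clickhouse_connect.get_client(host="localhost")
# mark_duplicate_queries(client, rows)

Because the mutation runs in the background, newly flagged rows can still show up in reads until it completes; from then on, the is_duplicate = 0 filters added throughout this commit keep them out.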
@@ -0,0 +1,2 @@
+ALTER TABLE search_queries
+DROP COLUMN IF EXISTS is_duplicate;
@@ -0,0 +1,2 @@
+ALTER TABLE search_queries
+ADD COLUMN IF NOT EXISTS is_duplicate UInt8 DEFAULT 0;
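The two new files above, whose paths are not shown in this view, appear to be the down and up ClickHouse migrations for this change: the first drops is_duplicate, the second adds it as UInt8 DEFAULT 0 so existing rows default to not-duplicate.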
12 changes: 6 additions & 6 deletions server/src/operators/analytics_operator.rs
@@ -70,7 +70,7 @@ pub async fn get_queries_for_cluster_query(
FROM search_queries
JOIN search_cluster_memberships ON search_queries.id = search_cluster_memberships.search_id
WHERE search_cluster_memberships.cluster_id = ?
-AND search_queries.dataset_id = ?
+AND search_queries.dataset_id = ? AND search_queries.is_duplicate = 0
ORDER BY
search_cluster_memberships.distance_to_centroid DESC
LIMIT 15
@@ -107,7 +107,7 @@ pub async fn get_search_query(
clickhouse_client: &clickhouse::Client,
) -> Result<SearchQueryEvent, ServiceError> {
let clickhouse_query = clickhouse_client
.query("SELECT ?fields FROM search_queries WHERE id = ? AND dataset_id = ?")
.query("SELECT ?fields FROM search_queries WHERE id = ? AND dataset_id = ? AND search_queries.is_duplicate = 0")
.bind(search_id)
.bind(dataset_id)
.fetch_one::<SearchQueryEventClickhouse>()
@@ -173,7 +173,7 @@ pub async fn get_head_queries_query(
count(*) AS count
FROM
default.search_queries
WHERE dataset_id = ?",
WHERE dataset_id = ? AND search_queries.is_duplicate = 0",
);

if let Some(filter) = filter {
@@ -218,7 +218,7 @@ pub async fn get_low_confidence_queries_query(
?fields
FROM
default.search_queries
WHERE dataset_id = ?",
WHERE dataset_id = ? AND search_queries.is_duplicate = 0",
);

if let Some(filter) = filter {
@@ -279,7 +279,7 @@ pub async fn get_no_result_queries_query(
FROM
default.search_queries
WHERE dataset_id = ?
-AND top_score = 0",
+AND top_score = 0 AND search_queries.is_duplicate = 0",
);

if let Some(filter) = filter {
@@ -329,7 +329,7 @@ pub async fn get_all_queries_query(
?fields
FROM
default.search_queries
WHERE dataset_id = ?",
WHERE dataset_id = ? AND search_queries.is_duplicate = 0",
);

if let Some(filter) = filter {
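Taken together, the analytics_operator.rs changes add the same AND search_queries.is_duplicate = 0 predicate to each read query (cluster members, single-search lookup, head queries, low-confidence queries, no-result queries, and the full listing), so rows flagged by the collapse script drop out of every analytics endpoint.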
