Commit e10b9bd

feature: add tag_regexes parameter to add more metadata as a ShopifyOption
1 parent 934e3a6 commit e10b9bd

4 files changed: 47 additions & 35 deletions

server/src/bin/crawl-worker.rs

Lines changed: 43 additions & 5 deletions
@@ -3,9 +3,12 @@ use diesel_async::pooled_connection::{AsyncDieselConnectionManager, ManagerConfi
 use sentry::{Hub, SentryFutureExt};
 use serde::{Deserialize, Serialize};
 use signal_hook::consts::SIGTERM;
-use std::sync::{
-    atomic::{AtomicBool, Ordering},
-    Arc,
+use std::{
+    collections::HashSet,
+    sync::{
+        atomic::{AtomicBool, Ordering},
+        Arc,
+    },
 };
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt, EnvFilter, Layer};
 use trieve_server::{
@@ -102,11 +105,45 @@ fn create_chunk_req_payload(
         base_url, product.handle, variant.id
     );
 
-    let chunk_html = format!(
+    let mut chunk_html = format!(
         "<h1>{} - {}</h1>{}",
         product.title, variant.title, product.body_html
     );
 
+    if let Some(ScrapeOptions::Shopify(CrawlShopifyOptions {
+        tag_regexes: Some(tag_regexes),
+        ..
+    })) = scrape_request.crawl_options.scrape_options.clone()
+    {
+        // Compile all regexes; if a regex is invalid, skip it
+        let regexes: Vec<(regex::Regex, String)> = tag_regexes
+            .iter()
+            .filter_map(|pattern| {
+                regex::Regex::new(pattern)
+                    .ok()
+                    .map(|regex| (regex, pattern.to_string()))
+            })
+            .collect();
+
+        // Go through all the tags, and find the ones that match the regexes
+        let tags_string: String = product
+            .tags
+            .iter()
+            .filter_map(|tag| {
+                // Add the pattern if the tag matches the regex
+                regexes
+                    .iter()
+                    .find(|(regex, _)| regex.is_match(tag))
+                    .map(|(_, pattern)| format!("<span>{}</span>", pattern.clone()))
+            })
+            .collect::<HashSet<String>>()
+            .into_iter()
+            .collect::<Vec<String>>()
+            .join("");
+
+        chunk_html = format!("<div>{}</div>\n\n<div>{}</div>", chunk_html, tags_string);
+    }
+
     let group_variants = if let Some(ScrapeOptions::Shopify(CrawlShopifyOptions {
         group_variants: Some(group_variants),
         ..
@@ -558,8 +595,9 @@ async fn crawl(
 
     let dataset_config = DatasetConfiguration::from_json(dataset.server_configuration.clone());
 
+    log::info!("Starting crawl for scrape_id: {}", scrape_request.id);
     let (page_count, chunks_created) = if let Some(ScrapeOptions::Shopify(_)) =
-        scrape_request.crawl_options.scrape_options
+        scrape_request.crawl_options.scrape_options.clone()
     {
        let mut cur_page = 1;
        let mut chunk_count = 0;
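
Taken on its own, the added logic is a filter-map over configured patterns and product tags: each tag that matches a pattern contributes that pattern (not the tag) as a deduplicated `<span>`. Below is a minimal standalone sketch of that behavior, assuming the `regex` crate already used by the worker; the helper name `tags_to_spans` and the example patterns are illustrative, not part of the commit.

```rust
use std::collections::HashSet;

/// Hypothetical helper mirroring the new logic in `create_chunk_req_payload`:
/// every tag matching one of the `tag_regexes` patterns contributes the
/// pattern text as a `<span>`, with duplicates collapsed via a HashSet.
fn tags_to_spans(tag_regexes: &[String], tags: &[String]) -> String {
    // Compile all patterns, silently skipping invalid ones.
    let regexes: Vec<(regex::Regex, String)> = tag_regexes
        .iter()
        .filter_map(|pattern| {
            regex::Regex::new(pattern)
                .ok()
                .map(|regex| (regex, pattern.to_string()))
        })
        .collect();

    tags.iter()
        .filter_map(|tag| {
            regexes
                .iter()
                .find(|(regex, _)| regex.is_match(tag))
                .map(|(_, pattern)| format!("<span>{}</span>", pattern))
        })
        .collect::<HashSet<String>>()
        .into_iter()
        .collect::<Vec<String>>()
        .join("")
}

fn main() {
    let patterns = vec!["^color_.*".to_string(), "^size_.*".to_string()];
    let tags = vec![
        "color_red".to_string(),
        "color_blue".to_string(),
        "material_cotton".to_string(),
    ];
    // Both color tags map to the same pattern, so only one span survives:
    // "<span>^color_.*</span>"
    println!("{}", tags_to_spans(&patterns, &tags));
}
```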

server/src/data/models.rs

Lines changed: 1 addition & 0 deletions
@@ -7094,6 +7094,7 @@ pub enum ScrapeOptions {
 pub struct CrawlShopifyOptions {
     /// This option will ingest all variants as individual chunks and place them in groups by product id. Turning this off will only scrape 1 variant per product. default: true
     pub group_variants: Option<bool>,
+    pub tag_regexes: Option<Vec<String>>,
 }
 
 impl CrawlOptions {
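
For reference, a sketch of how a caller might populate the new field. Only the `CrawlShopifyOptions` fields and the `ScrapeOptions::Shopify` variant come from the diffs in this commit; the import path and the example patterns are assumptions for illustration.

```rust
// Assumed import path; the types live in server/src/data/models.rs.
use trieve_server::data::models::{CrawlShopifyOptions, ScrapeOptions};

fn example_shopify_scrape_options() -> ScrapeOptions {
    ScrapeOptions::Shopify(CrawlShopifyOptions {
        // Keep ingesting every variant as its own chunk (the documented default).
        group_variants: Some(true),
        // Illustrative patterns: any product tag matching one of these regexes
        // gets its pattern rendered as an extra <span> in the chunk HTML.
        tag_regexes: Some(vec!["^color_.*".to_string(), "^size_.*".to_string()]),
    })
}
```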

server/src/operators/crawl_operator.rs

Lines changed: 0 additions & 29 deletions
@@ -156,35 +156,6 @@ pub async fn crawl(
     Ok(scrape_id)
 }
 
-pub async fn get_crawl_request(
-    crawl_id: uuid::Uuid,
-    pool: web::Data<Pool>,
-) -> Result<CrawlRequest, ServiceError> {
-    use crate::data::schema::crawl_requests::dsl::*;
-    let mut conn = pool
-        .get()
-        .await
-        .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
-    let request = crawl_requests
-        .select((
-            id,
-            url,
-            status,
-            next_crawl_at,
-            interval,
-            crawl_options,
-            scrape_id,
-            dataset_id,
-            created_at,
-        ))
-        .filter(scrape_id.eq(crawl_id))
-        .first::<CrawlRequestPG>(&mut conn)
-        .await
-        .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
-
-    Ok(request.into())
-}
-
 pub async fn get_crawl_request_by_dataset_id_query(
     dataset_id: uuid::Uuid,
     pool: web::Data<Pool>,

server/src/operators/qdrant_operator.rs

Lines changed: 3 additions & 1 deletion
@@ -109,7 +109,9 @@ pub async fn create_new_qdrant_collection_query(
     let collection = qdrant_client
         .collection_exists(collection_name.clone())
         .await
-        .map_err(|e| ServiceError::BadRequest(e.to_string()))?;
+        .map_err(|e| {
+            ServiceError::BadRequest(format!("Failed to see if collection exists {}", e))
+        })?;
 
     match collection {
         true => log::info!("Avoided creating collection as it already exists"),
