Commit ffac3fc
wip: removing ChunkMetadataWithFileData
cdxker committed Apr 9, 2024
1 parent a6b5ca8 commit ffac3fc
Showing 8 changed files with 71 additions and 268 deletions.
53 changes: 1 addition & 52 deletions server/src/data/models.rs
@@ -347,40 +347,6 @@ impl ChunkCollision {
}
}

#[derive(Debug, Serialize, Deserialize, Clone, ToSchema)]
#[schema(example=json!({
"id": "e3e3e3e3-e3e3-e3e3-e3e3-e3e3e3e3e3e3",
"content": "Hello, world!",
"link": "https://trieve.ai",
"qdrant_point_id": "e3e3e3e3-e3e3-e3e3-e3e3-e3e3e3e3e3e3",
"created_at": "2021-01-01T00:00:00",
"updated_at": "2021-01-01T00:00:00",
"tag_set": "tag1,tag2",
"chunk_html": "<p>Hello, world!</p>",
"metadata": {"key": "value"},
"tracking_id": "e3e3e3e3-e3e3-e3e3-e3e3-e3e3e3e3e3e3",
"time_stamp": "2021-01-01T00:00:00",
"weight": 0.5,
"file_id": "e3e3e3e3-e3e3-e3e3-e3e3-e3e3e3e3e3e3",
"file_name": "file.txt",
}))]
pub struct ChunkMetadataWithFileData {
pub id: uuid::Uuid,
pub content: String,
pub chunk_html: Option<String>,
pub link: Option<String>,
pub qdrant_point_id: uuid::Uuid,
pub created_at: chrono::NaiveDateTime,
pub updated_at: chrono::NaiveDateTime,
pub tag_set: Option<String>,
pub file_id: Option<uuid::Uuid>,
pub file_name: Option<String>,
pub metadata: Option<serde_json::Value>,
pub tracking_id: Option<String>,
pub time_stamp: Option<NaiveDateTime>,
pub weight: f64,
}

#[derive(Debug, Serialize, Deserialize, Clone, Queryable, ToSchema)]
#[schema(example = json!({
"id": "e3e3e3e3-e3e3-e3e3-e3e3-e3e3e3e3e3e3",
@@ -425,23 +391,6 @@ impl From<ChunkMetadata> for SlimChunkMetadata {
}
}

impl From<ChunkMetadataWithFileData> for SlimChunkMetadata {
fn from(chunk: ChunkMetadataWithFileData) -> Self {
SlimChunkMetadata {
id: chunk.id,
link: chunk.link,
qdrant_point_id: Some(chunk.qdrant_point_id),
created_at: chunk.created_at,
updated_at: chunk.updated_at,
tag_set: chunk.tag_set,
metadata: chunk.metadata,
tracking_id: chunk.tracking_id,
time_stamp: chunk.time_stamp,
weight: chunk.weight,
}
}
}

#[derive(Debug, Serialize, Deserialize, Clone, Queryable, ToSchema)]
#[schema(
example = json!({
@@ -759,7 +708,7 @@ pub struct UserDTOWithChunks {
pub email: Option<String>,
pub created_at: chrono::NaiveDateTime,
pub total_chunks_created: i64,
pub chunks: Vec<ChunkMetadataWithFileData>,
pub chunks: Vec<ChunkMetadata>,
}

#[derive(Debug, Serialize, Deserialize, Clone, Queryable, Default)]
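For orientation, the sketch below approximates the plain `ChunkMetadata` shape that `UserDTOWithChunks.chunks` now carries, with the fields inferred from the struct literal built in `chunk_operator.rs` later in this commit. It is not the literal definition in `models.rs` (derives and schema attributes are trimmed), but it makes the difference from the removed `ChunkMetadataWithFileData` clear: `file_id` and `file_name` no longer ride along with each chunk.

```rust
use chrono::NaiveDateTime;

// Approximate shape of ChunkMetadata as used after this commit (derives trimmed,
// fields inferred from the struct literal in chunk_operator.rs; not the exact definition).
#[derive(Debug, Clone)]
pub struct ChunkMetadata {
    pub id: uuid::Uuid,
    pub content: String,
    pub link: Option<String>,
    pub tag_set: Option<String>,
    // Optional here, unlike on the removed ChunkMetadataWithFileData, so collided
    // chunks can fall back to the collision's qdrant id.
    pub qdrant_point_id: Option<uuid::Uuid>,
    pub created_at: NaiveDateTime,
    pub updated_at: NaiveDateTime,
    pub chunk_html: Option<String>,
    pub metadata: Option<serde_json::Value>,
    pub tracking_id: Option<String>,
    pub time_stamp: Option<NaiveDateTime>,
    pub dataset_id: uuid::Uuid,
    pub weight: f64,
    // file_id and file_name are gone; file info now has to be looked up separately.
}
```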
6 changes: 3 additions & 3 deletions server/src/handlers/chunk_handler.rs
@@ -1,6 +1,6 @@
use super::auth_handler::{AdminOnly, LoggedUser};
use crate::data::models::{
ChatMessageProxy, ChunkMetadata, ChunkMetadataWithFileData, DatasetAndOrgWithSubAndPlan,
ChatMessageProxy, ChunkMetadata, DatasetAndOrgWithSubAndPlan,
IngestSpecificChunkMetadata, Pool, RedisPool, ScoreSlimChunks,
SearchSlimChunkQueryResponseBody, ServerDatasetConfiguration, SlimChunkMetadata, UnifiedId,
};
@@ -1185,14 +1185,14 @@ pub struct SearchChunkData {
"score": 0.5
}))]
pub struct ScoreChunkDTO {
pub metadata: Vec<ChunkMetadataWithFileData>,
pub metadata: Vec<ChunkMetadata>,
pub score: f64,
}

#[derive(Serialize, Deserialize, Debug, ToSchema, Clone)]
pub enum ChunkMetadataTypes {
IDs(Vec<SlimChunkMetadata>),
MetadataWithFileData(Vec<ChunkMetadataWithFileData>),
MetadataWithFileData(Vec<ChunkMetadata>),
}

#[derive(Serialize, Deserialize, ToSchema, Debug)]
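As a rough illustration of what the handler change means downstream, the sketch below matches over the two `ChunkMetadataTypes` variants. The structs are minimal stand-ins for the real `SlimChunkMetadata` and `ChunkMetadata`, not the definitions from `data::models`; the point is only that the full-metadata variant no longer exposes a `file_name` on each hit.

```rust
// Stand-in types; the real SlimChunkMetadata and ChunkMetadata live in data::models
// and carry many more fields.
struct SlimChunkMetadata { id: String }
struct ChunkMetadata { id: String, content: String }

// After this commit the full variant carries plain ChunkMetadata instead of
// ChunkMetadataWithFileData, so there is no inline file_name to read off a search hit.
enum ChunkMetadataTypes {
    IDs(Vec<SlimChunkMetadata>),
    MetadataWithFileData(Vec<ChunkMetadata>),
}

fn summarize(results: &ChunkMetadataTypes) -> Vec<String> {
    match results {
        ChunkMetadataTypes::IDs(slim) => slim.iter().map(|c| c.id.clone()).collect(),
        ChunkMetadataTypes::MetadataWithFileData(full) => full
            .iter()
            .map(|c| format!("{}: {}", c.id, c.content))
            .collect(),
    }
}
```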
4 changes: 2 additions & 2 deletions server/src/handlers/group_handler.rs
@@ -4,7 +4,7 @@ use super::{
};
use crate::{
data::models::{
ChunkGroup, ChunkGroupAndFile, ChunkGroupBookmark, ChunkMetadataWithFileData,
ChunkGroup, ChunkGroupAndFile, ChunkGroupBookmark, ChunkMetadata,
DatasetAndOrgWithSubAndPlan, GroupSlimChunksDTO, Pool, ScoreSlimChunks,
SearchGroupSlimChunksResult, SearchOverGroupsSlimChunksResponseBody,
ServerDatasetConfiguration, UnifiedId,
@@ -656,7 +656,7 @@ pub async fn add_chunk_to_group_by_tracking_id(

#[derive(Deserialize, Serialize, Debug, ToSchema)]
pub struct BookmarkData {
pub chunks: Vec<ChunkMetadataWithFileData>,
pub chunks: Vec<ChunkMetadata>,
pub group: ChunkGroup,
pub total_pages: i64,
}
6 changes: 3 additions & 3 deletions server/src/handlers/message_handler.rs
@@ -1,7 +1,7 @@
use super::{auth_handler::LoggedUser, chunk_handler::ParsedQuery};
use crate::{
data::models::{self, DatasetAndOrgWithSubAndPlan, ServerDatasetConfiguration},
data::models::{ChunkMetadataWithFileData, Dataset, Pool},
data::models::{ChunkMetadata, Dataset, Pool},
errors::ServiceError,
get_env,
operators::{
@@ -691,7 +691,7 @@ pub async fn stream_response(
)
.await?;

let citation_chunks: Vec<ChunkMetadataWithFileData> = metadata_chunks.to_vec();
let citation_chunks: Vec<ChunkMetadata> = metadata_chunks.to_vec();

let highlighted_citation_chunks = if highlight_citations.unwrap_or(true) {
citation_chunks
@@ -711,7 +711,7 @@ pub async fn stream_response(
)
.unwrap_or(chunk.clone())
})
.collect::<Vec<ChunkMetadataWithFileData>>()
.collect::<Vec<ChunkMetadata>>()
} else {
citation_chunks.clone()
};
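The `stream_response` hunks above keep the existing highlight-with-fallback pattern while changing the element type to `ChunkMetadata`. Here is a minimal, self-contained sketch of that pattern, with a toy `Chunk` and `highlight` standing in for `ChunkMetadata` and `find_relevant_sentence`:

```rust
#[derive(Clone, Debug)]
struct Chunk {
    chunk_html: String,
}

// Toy stand-in for find_relevant_sentence: wrap the query in <mark> tags,
// or report an error when the query does not occur in the chunk.
fn highlight(chunk: Chunk, query: &str) -> Result<Chunk, String> {
    if chunk.chunk_html.contains(query) {
        Ok(Chunk {
            chunk_html: chunk.chunk_html.replace(query, &format!("<mark>{query}</mark>")),
        })
    } else {
        Err("query not found".to_string())
    }
}

fn main() {
    let citation_chunks = vec![
        Chunk { chunk_html: "<p>Hello, world!</p>".to_string() },
        Chunk { chunk_html: "<p>Nothing relevant here.</p>".to_string() },
    ];

    // Same shape as the stream_response loop: highlight each chunk, but keep the
    // original when highlighting fails.
    let highlighted: Vec<Chunk> = citation_chunks
        .iter()
        .map(|chunk| highlight(chunk.clone(), "world").unwrap_or(chunk.clone()))
        .collect();

    println!("{highlighted:?}");
}
```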
1 change: 0 additions & 1 deletion server/src/lib.rs
@@ -280,7 +280,6 @@ impl Modify for SecurityAddon {
data::models::Topic,
data::models::Message,
data::models::ChunkMetadata,
data::models::ChunkMetadataWithFileData,
data::models::ChatMessageProxy,
data::models::Event,
data::models::SlimGroup,
96 changes: 18 additions & 78 deletions server/src/operators/chunk_operator.rs
@@ -1,5 +1,5 @@
use crate::data::models::{
ChunkCollision, ChunkFile, ChunkFileWithName, ChunkGroupBookmark, ChunkMetadataWithFileData,
ChunkCollision, ChunkFile, ChunkGroupBookmark,
Dataset, FullTextSearchResult, ServerDatasetConfiguration, UnifiedId,
};
use crate::operators::model_operator::create_embeddings;
@@ -22,31 +22,20 @@ use simsearch::SimSearch;
pub async fn get_metadata_from_point_ids(
point_ids: Vec<uuid::Uuid>,
pool: web::Data<Pool>,
) -> Result<Vec<ChunkMetadataWithFileData>, ServiceError> {
) -> Result<Vec<ChunkMetadata>, ServiceError> {
use crate::data::schema::chunk_metadata::dsl as chunk_metadata_columns;

let mut conn = pool
.get()
.await
.expect("Failed to get connection from pool");

let chunk_metadata: Vec<ChunkMetadata> = chunk_metadata_columns::chunk_metadata
chunk_metadata_columns::chunk_metadata
.filter(chunk_metadata_columns::qdrant_point_id.eq_any(&point_ids))
.select(ChunkMetadata::as_select())
.load::<ChunkMetadata>(&mut conn)
.await
.map_err(|_| ServiceError::BadRequest("Failed to load metadata".to_string()))?;

let converted_chunks: Vec<FullTextSearchResult> = chunk_metadata
.iter()
.map(|chunk| <ChunkMetadata as Into<FullTextSearchResult>>::into(chunk.clone()))
.collect::<Vec<FullTextSearchResult>>();

let chunk_metadata_with_file_id = get_metadata_query(converted_chunks, pool)
.await
.map_err(|_| ServiceError::BadRequest("Failed to load metadata".to_string()))?;

Ok(chunk_metadata_with_file_id)
.map_err(|_| ServiceError::BadRequest("Failed to load metadata".to_string()))
}

pub async fn get_point_ids_from_unified_chunk_ids(
@@ -99,7 +88,7 @@ pub async fn get_point_ids_from_unified_chunk_ids(
}

pub struct ChunkMetadataWithQdrantId {
pub metadata: ChunkMetadataWithFileData,
pub metadata: ChunkMetadata,
pub qdrant_id: uuid::Uuid,
}

@@ -110,7 +99,7 @@ pub async fn get_metadata_and_collided_chunks_from_point_ids_query(
pool: web::Data<Pool>,
) -> Result<
(
Vec<ChunkMetadataWithFileData>,
Vec<ChunkMetadata>,
Vec<ChunkMetadataWithQdrantId>,
),
ServiceError,
@@ -146,56 +135,14 @@ pub async fn get_metadata_and_collided_chunks_from_point_ids_query(
// Fetch the chunk metadata for root cards
let chunk_search_result = {
let mut conn = pool.get().await.unwrap();
let chunk_metadata = chunk_metadata_columns::chunk_metadata
.left_outer_join(
chunk_files_columns::chunk_files
.on(chunk_metadata_columns::id.eq(chunk_files_columns::chunk_id)),
)
.left_outer_join(
files_columns::files.on(chunk_files_columns::file_id.eq(files_columns::id)),
)
.left_outer_join(
chunk_collisions_columns::chunk_collisions
.on(chunk_metadata_columns::id.eq(chunk_collisions_columns::chunk_id)),
)
.select((
chunk_metadata_columns::chunk_metadata
.select(
ChunkMetadata::as_select(),
(chunk_collisions_columns::collision_qdrant_id).nullable(),
(
chunk_files_columns::chunk_id,
chunk_files_columns::file_id,
files_columns::file_name,
)
.nullable(),
))
)
.filter(chunk_metadata_columns::qdrant_point_id.eq_any(&point_ids))
.load::<(ChunkMetadata, Option<uuid::Uuid>, Option<ChunkFileWithName>)>(&mut conn)
.load::<ChunkMetadata>(&mut conn)
.await
.map_err(|_| ServiceError::BadRequest("Failed to load metadata".to_string()))?;

chunk_metadata
.iter()
.map(|chunk| ChunkMetadataWithFileData {
id: chunk.0.id,
content: chunk.0.content.clone(),
link: chunk.0.link.clone(),
tag_set: chunk.0.tag_set.clone(),
qdrant_point_id: chunk.0.qdrant_point_id.unwrap_or_else(|| {
chunk
.1
.expect("Must have qdrant_id from collision or metadata")
}),
created_at: chunk.0.created_at,
updated_at: chunk.0.updated_at,
chunk_html: chunk.0.chunk_html.clone(),
file_id: chunk.2.clone().map(|file| file.file_id),
file_name: chunk.2.clone().map(|file| file.file_name.to_string()),
metadata: chunk.0.metadata.clone(),
tracking_id: chunk.0.tracking_id.clone(),
time_stamp: chunk.0.time_stamp,
weight: chunk.0.weight,
})
.collect::<Vec<ChunkMetadataWithFileData>>()
.map_err(|_| ServiceError::BadRequest("Failed to load metadata".to_string()))?
};

chunk_search_span.finish();
@@ -225,35 +172,28 @@ pub async fn get_metadata_and_collided_chunks_from_point_ids_query(
.select((
ChunkMetadata::as_select(),
chunk_collisions_columns::collision_qdrant_id.assume_not_null(),
(
chunk_files_columns::chunk_id,
chunk_files_columns::file_id,
files_columns::file_name,
)
.nullable(),
))
.load::<(ChunkMetadata, uuid::Uuid, Option<ChunkFileWithName>)>(&mut conn)
.load::<(ChunkMetadata, uuid::Uuid)>(&mut conn)
.await
.map_err(|_| ServiceError::BadRequest("Failed to load metadata".to_string()))?;

// Convert the collided chunks into the appropriate format
chunk_metadata
.iter()
.map(|chunk| {
let chunk_metadata = ChunkMetadataWithFileData {
let chunk_metadata = ChunkMetadata {
id: chunk.0.id,
content: chunk.0.content.clone(),
link: chunk.0.link.clone(),
tag_set: chunk.0.tag_set.clone(),
qdrant_point_id: chunk.0.qdrant_point_id.unwrap_or(chunk.1),
qdrant_point_id: Some(chunk.0.qdrant_point_id.unwrap_or(chunk.1)),
created_at: chunk.0.created_at,
updated_at: chunk.0.updated_at,
chunk_html: chunk.0.chunk_html.clone(),
file_id: chunk.2.clone().map(|file| file.file_id),
file_name: chunk.2.clone().map(|file| file.file_name.to_string()),
metadata: chunk.0.metadata.clone(),
tracking_id: chunk.0.tracking_id.clone(),
time_stamp: chunk.0.time_stamp,
dataset_id: chunk.0.dataset_id,
weight: chunk.0.weight,
};
ChunkMetadataWithQdrantId {
@@ -358,7 +298,7 @@ pub async fn get_metadata_from_ids_query(
chunk_ids: Vec<uuid::Uuid>,
dataset_uuid: uuid::Uuid,
pool: web::Data<Pool>,
) -> Result<Vec<ChunkMetadataWithFileData>, ServiceError> {
) -> Result<Vec<ChunkMetadata>, ServiceError> {
use crate::data::schema::chunk_metadata::dsl as chunk_metadata_columns;

let mut conn = pool.get().await.unwrap();
@@ -1027,10 +967,10 @@ pub async fn get_qdrant_id_from_chunk_id_query(

#[tracing::instrument]
pub fn find_relevant_sentence(
input: ChunkMetadataWithFileData,
input: ChunkMetadata,
query: String,
split_chars: Vec<String>,
) -> Result<ChunkMetadataWithFileData, ServiceError> {
) -> Result<ChunkMetadata, ServiceError> {
let content = &input.chunk_html.clone().unwrap_or(input.content.clone());
let mut engine: SimSearch<String> = SimSearch::new();
let mut split_content = content
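`find_relevant_sentence`, whose signature changes above, indexes the split-up chunk content with `SimSearch` and queries it. The sketch below shows that index-then-search pattern in isolation; the sentence splitting and result handling are simplified stand-ins rather than the function's actual logic.

```rust
use simsearch::SimSearch;

// Simplified sketch of the SimSearch pattern: index each sentence of a chunk,
// then fuzzy-search for the user's query. Sentence splitting here is naive.
fn most_relevant_sentence(content: &str, query: &str) -> Option<String> {
    let sentences: Vec<String> = content
        .split(|c: char| c == '.' || c == '!' || c == '?')
        .map(|s| s.trim().to_string())
        .filter(|s| !s.is_empty())
        .collect();

    let mut engine: SimSearch<usize> = SimSearch::new();
    for (idx, sentence) in sentences.iter().enumerate() {
        engine.insert(idx, sentence);
    }

    // SimSearch returns ids ordered by similarity; take the best hit, if any.
    engine
        .search(query)
        .first()
        .map(|idx| sentences[*idx].clone())
}

fn main() {
    let content = "Trieve stores chunks. Relevance matters a lot. Unrelated sentence here.";
    println!("{:?}", most_relevant_sentence(content, "relevance"));
}
```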
6 changes: 3 additions & 3 deletions server/src/operators/group_operator.rs
@@ -1,13 +1,13 @@
use crate::{
data::models::{
ChunkGroup, ChunkMetadata, Dataset, FileGroup, Pool, ServerDatasetConfiguration, UnifiedId,
ChunkGroup, Dataset, FileGroup, Pool, ServerDatasetConfiguration, UnifiedId,
},
operators::chunk_operator::delete_chunk_metadata_query,
};
use crate::{
data::models::{
ChunkGroupAndFileWithCount, ChunkGroupBookmark, ChunkMetadataWithCount,
ChunkMetadataWithFileData, FullTextSearchResult, SlimGroup,
ChunkMetadata, FullTextSearchResult, SlimGroup,
},
errors::ServiceError,
operators::search_operator::get_metadata_query,
@@ -381,7 +381,7 @@ pub async fn create_chunk_bookmark_query(
Ok(qdrant_point_id)
}
pub struct GroupsBookmarkQueryResult {
pub metadata: Vec<ChunkMetadataWithFileData>,
pub metadata: Vec<ChunkMetadata>,
pub group: ChunkGroup,
pub total_pages: i64,
}
