Skip to content

Commit

Permalink
Prefix queries working, new CLI.
Browse files Browse the repository at this point in the history
  • Loading branch information
ellenhp committed Feb 5, 2024
1 parent 49d63a5 commit 921eff3
Show file tree
Hide file tree
Showing 20 changed files with 883 additions and 202 deletions.
4 changes: 1 addition & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
[workspace]
resolver = "2"
members = [
"airmail", "airmail_common",
"airmail_parser",
"parser_demo",
"airmail", "airmail_common", "airmail_index", "airmail_parser", "parser_demo",
]
20 changes: 0 additions & 20 deletions airmail/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,12 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
bollard = "0.15.0"
clap = { version = "4.4.18", features = ["derive"] }
futures-util = "0.3.30"
geojson = "0.24.1"
levenshtein_automata = "0.2.1"
s2 = "0.0.12"
tantivy = "0.21.1"
tantivy-common = "0.6.0"
tantivy-fst = "0.4.0"
tempfile = "3.9.0"
airmail_common = { path = "../airmail_common" }
deunicode = "1.4.2"
regex = "1.10.3"
lazy_static = "1.4.0"
reqwest = { version = "0.11", features = ["json"] }
tokio = { version = "1", features = ["full"] }
serde = { version = "1.0.196", features = ["derive"] }
log = "0.4.20"
env_logger = "0.11.1"
crossbeam = { version = "0.8.4", features = ["crossbeam-channel"] }
rayon = "1.8.1"
serde_json = "1.0.113"
airmail_parser = { path = "../airmail_parser" }

[[bin]]
name = "index"

[[bin]]
name = "query"
25 changes: 0 additions & 25 deletions airmail/src/bin/query.rs

This file was deleted.

102 changes: 80 additions & 22 deletions airmail/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,24 @@ use airmail_parser::{component::QueryComponentType, query::QueryScenario};
use tantivy::{
collector::TopDocs,
directory::MmapDirectory,
query::{BooleanQuery, PhrasePrefixQuery, Query, TermQuery},
schema::{IndexRecordOption, Schema, INDEXED, STORED, TEXT},
query::{BooleanQuery, FuzzyTermQuery, PhrasePrefixQuery, Query},
schema::{FacetOptions, Schema, INDEXED, STORED, TEXT},
Term,
};

use crate::poi::AirmailPoi;

// Field name keys.
//
// Each constant is the string name under which a field is registered in the
// tantivy schema and later looked up again via `schema.get_field(...)` when
// documents are written.

// Name(s) of the POI.
pub const FIELD_NAME: &str = "name";
// Source string carried by the POI.
pub const FIELD_SOURCE: &str = "source";
// Category of the POI.
pub const FIELD_CATEGORY: &str = "category";
// House number part of the address.
pub const FIELD_HOUSE_NUMBER: &str = "house_number";
// Road part of the address.
pub const FIELD_ROAD: &str = "road";
// Unit part of the address.
pub const FIELD_UNIT: &str = "unit";
// Locality the POI belongs to.
pub const FIELD_LOCALITY: &str = "locality";
// Region the POI belongs to.
pub const FIELD_REGION: &str = "region";
// S2 cell id of the POI location.
pub const FIELD_S2CELL: &str = "s2cell";
// NOTE(review): presumably holds the POI's tag pairs; the usage is not visible here, confirm.
pub const FIELD_TAGS: &str = "tags";

pub struct AirmailIndex {
tantivy_index: tantivy::Index,
Expand All @@ -26,6 +28,8 @@ pub struct AirmailIndex {
fn query_for_terms(
field: tantivy::schema::Field,
terms: Vec<&str>,
is_prefix: bool,
distance: u8,
) -> Result<Box<dyn Query>, Box<dyn std::error::Error>> {
if terms.len() > 1 {
Ok(Box::new(PhrasePrefixQuery::new(
Expand All @@ -35,18 +39,29 @@ fn query_for_terms(
.collect(),
)))
} else {
Ok(Box::new(TermQuery::new(
Term::from_field_text(field, terms[0]),
IndexRecordOption::Basic,
)))
if is_prefix {
Ok(Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(field, terms[0]),
distance,
true,
)))
} else {
Ok(Box::new(FuzzyTermQuery::new(
Term::from_field_text(field, terms[0]),
distance,
true,
)))
}
}
}

impl AirmailIndex {
fn schema() -> tantivy::schema::Schema {
let mut schema_builder = Schema::builder();
let _ = schema_builder.add_text_field(FIELD_NAME, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_CATEGORY, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_SOURCE, TEXT | STORED);
let _ =
schema_builder.add_facet_field(FIELD_CATEGORY, FacetOptions::default().set_stored());
let _ = schema_builder.add_text_field(FIELD_HOUSE_NUMBER, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_ROAD, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_UNIT, TEXT | STORED);
Expand Down Expand Up @@ -110,7 +125,7 @@ impl AirmailIndex {
}

pub fn writer(&mut self) -> Result<AirmailIndexWriter, Box<dyn std::error::Error>> {
let tantivy_writer = self.tantivy_index.writer(50_000_000)?;
let tantivy_writer = self.tantivy_index.writer(200_000_000)?;
let writer = AirmailIndexWriter {
tantivy_writer,
schema: self.tantivy_index.schema(),
Expand All @@ -122,8 +137,15 @@ impl AirmailIndex {
let tantivy_reader = self.tantivy_index.reader()?;
let searcher = tantivy_reader.searcher();
let mut queries: Vec<Box<dyn Query>> = Vec::new();
for component in &query.as_vec() {
let terms: Vec<&str> = component.text().split_whitespace().collect();
let query_vec = query.as_vec();
for (i, component) in query_vec.iter().enumerate() {
let is_prefix = i == query_vec.len() - 1;
let terms: Vec<String> = component
.text()
.split_whitespace()
.map(|term| term.to_lowercase())
.collect();
let term_strs = terms.iter().map(|s| s.as_str()).collect();
if terms.is_empty() {
continue;
}
Expand All @@ -137,35 +159,64 @@ impl AirmailIndex {
}

QueryComponentType::HouseNumberComponent => {
queries.push(query_for_terms(self.field_house_number(), terms)?);
queries.push(query_for_terms(
self.field_house_number(),
term_strs,
is_prefix,
0, // Never fuzzy match house numbers.
)?);
}

QueryComponentType::RoadComponent => {
queries.push(query_for_terms(self.field_road(), terms)?);
queries.push(query_for_terms(self.field_road(), term_strs, is_prefix, 1)?);
}

QueryComponentType::IntersectionComponent => {
// No-op
}

QueryComponentType::SublocalityComponent => {
// No-op
// No-op, and probably always will be. "Downtown" is very subjective, for example.
}

QueryComponentType::LocalityComponent => {
queries.push(query_for_terms(self.field_locality(), terms)?);
queries.push(query_for_terms(
self.field_locality(),
term_strs,
is_prefix,
1,
)?);
}

QueryComponentType::RegionComponent => {
queries.push(query_for_terms(self.field_region(), terms)?);
queries.push(query_for_terms(
self.field_region(),
term_strs,
is_prefix,
1,
)?);
}

QueryComponentType::CountryComponent => {
// No-op
}

QueryComponentType::PlaceNameComponent => {
// No-op
let mut new_terms = Vec::new();
for term in &terms {
if term.ends_with("'s") {
new_terms.push(term.trim_end_matches("'s"));
} else if term.ends_with("s") {
new_terms.push(term.trim_end_matches('s'));
}
}
let original = query_for_terms(self.field_name(), term_strs, is_prefix, 1)?;
if new_terms.is_empty() {
queries.push(original);
} else {
let modified = query_for_terms(self.field_name(), new_terms, is_prefix, 1)?;
queries.push(Box::new(BooleanQuery::union(vec![original, modified])));
}
}

QueryComponentType::IntersectionJoinWordComponent => {
Expand All @@ -176,8 +227,7 @@ impl AirmailIndex {

let query = BooleanQuery::intersection(queries);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
println!("Found {} hits", top_docs.len());
let results = Vec::new();
let mut results = Vec::new();
for (_score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;
let house_num: Vec<&str> = doc
Expand All @@ -203,11 +253,15 @@ impl AirmailIndex {
.unwrap();
let cellid = s2::cellid::CellID(s2cell);
let latlng = s2::latlng::LatLng::from(cellid);
let names = doc
.get_all(self.field_name())
.filter_map(|v| v.as_text())
.collect::<Vec<&str>>();

println!(
"house_num: {:?}, road: {:?}, unit: {:?}, locality: {:?}, latlng: {:?}",
house_num, road, unit, locality, latlng
);
results.push(format!(
"names: {:?}, house_num: {:?}, road: {:?}, unit: {:?}, locality: {:?}, latlng: {:?}",
names, house_num, road, unit, locality, latlng
));
}

Ok(results)
Expand All @@ -225,6 +279,10 @@ impl AirmailIndexWriter {
for name in poi.name {
doc.add_text(self.schema.get_field(FIELD_NAME).unwrap(), name);
}
doc.add_text(self.schema.get_field(FIELD_SOURCE).unwrap(), poi.source);
for category in poi.category {
doc.add_facet(self.schema.get_field(FIELD_CATEGORY).unwrap(), &category);
}
for house_number in poi.house_number {
doc.add_text(
self.schema.get_field(FIELD_HOUSE_NUMBER).unwrap(),
Expand Down
3 changes: 0 additions & 3 deletions airmail/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
#[macro_use]
extern crate lazy_static;

pub mod index;
pub mod parser;
pub mod poi;
51 changes: 51 additions & 0 deletions airmail/src/poi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
use std::error::Error;

use airmail_common::categories::PoiCategory;

/// A single point of interest in the form it is handed to the index writer.
///
/// Most fields are `Vec<String>`, so one POI can carry several alternative
/// values for the same attribute.
#[derive(Debug, Clone)]
pub struct AirmailPoi {
/// Name(s) of the POI.
pub name: Vec<String>,
/// Source string supplied by the caller of `new`.
pub source: String,
/// Categories in their facet string form, as produced by `PoiCategory::to_facet`.
pub category: Vec<String>,
/// House number(s) of the address.
pub house_number: Vec<String>,
/// Road name(s) of the address.
pub road: Vec<String>,
/// Unit designator(s) of the address.
pub unit: Vec<String>,
/// Locality names; `new` leaves this empty.
pub locality: Vec<String>,
/// Region names; `new` leaves this empty.
pub region: Vec<String>,
/// Country names; `new` leaves this empty.
pub country: Vec<String>,
/// Raw `u64` value of the S2 cell id derived from the POI's latitude/longitude.
pub s2cell: u64,
/// Pairs of strings attached to the POI.
/// NOTE(review): presumably (key, value) tag pairs; confirm against the importer.
pub tags: Vec<(String, String)>,
}

impl AirmailPoi {
    /// Builds a POI from its address parts, categories and coordinates.
    ///
    /// `lat` and `lng` are given in degrees and are converted into an S2 cell
    /// id, whose raw `u64` value is stored in `s2cell`. Every entry of
    /// `category` is turned into its facet string with `to_facet`. The
    /// `locality`, `region` and `country` lists start out empty.
    ///
    /// # Errors
    ///
    /// The body shown here has no failing path and always returns `Ok`.
    pub fn new(
        name: Vec<String>,
        source: String,
        category: Vec<PoiCategory>,
        house_number: Vec<String>,
        road: Vec<String>,
        unit: Vec<String>,
        lat: f64,
        lng: f64,
        tags: Vec<(String, String)>,
    ) -> Result<Self, Box<dyn Error>> {
        // Degrees -> LatLng -> CellID; only the raw id is kept.
        let position = s2::latlng::LatLng::from_degrees(lat, lng);
        let cell = s2::cellid::CellID::from(position);

        // FIXME (carried over from the original, which does not say what needs fixing).
        let facets: Vec<String> = category.iter().map(|entry| entry.to_facet()).collect();

        let poi = Self {
            name,
            source,
            category: facets,
            house_number,
            road,
            unit,
            locality: Vec::new(),
            region: Vec::new(),
            country: Vec::new(),
            s2cell: cell.0,
            tags,
        };
        Ok(poi)
    }
}
Loading

0 comments on commit 921eff3

Please sign in to comment.