From 921eff3fe4ef67372229dddbf3c98ff79f0ccc48 Mon Sep 17 00:00:00 2001 From: Ellen Poe Date: Sun, 4 Feb 2024 18:21:35 -0800 Subject: [PATCH] Prefix queries working, new CLI. --- Cargo.toml | 4 +- airmail/Cargo.toml | 20 -- airmail/src/bin/query.rs | 25 -- airmail/src/index.rs | 102 ++++++-- airmail/src/lib.rs | 3 - airmail/src/poi.rs | 51 ++++ airmail/src/poi/mod.rs | 77 ------- airmail_common/Cargo.toml | 1 + airmail_common/src/categories.rs | 218 ++++++++++++++++-- airmail_index/Cargo.toml | 35 +++ .../permute_dicts/en/localities.txt | 0 .../permute_dicts/en/street_types.txt | 0 airmail_index/src/bin/query.rs | 49 ++++ .../bin/index.rs => airmail_index/src/main.rs | 202 ++++++++++++++-- .../src}/openaddresses.rs | 5 +- airmail_index/src/openstreetmap.rs | 185 +++++++++++++++ .../poi => airmail_index/src}/query_pip.rs | 5 +- .../src}/substitutions.rs | 3 +- airmail_parser/src/component.rs | 91 +++++++- airmail_parser/src/scorers.rs | 9 +- 20 files changed, 883 insertions(+), 202 deletions(-) delete mode 100644 airmail/src/bin/query.rs create mode 100644 airmail/src/poi.rs delete mode 100644 airmail/src/poi/mod.rs create mode 100644 airmail_index/Cargo.toml rename {airmail => airmail_index}/permute_dicts/en/localities.txt (100%) rename {airmail => airmail_index}/permute_dicts/en/street_types.txt (100%) create mode 100644 airmail_index/src/bin/query.rs rename airmail/src/bin/index.rs => airmail_index/src/main.rs (55%) rename {airmail/src/poi => airmail_index/src}/openaddresses.rs (89%) create mode 100644 airmail_index/src/openstreetmap.rs rename {airmail/src/poi => airmail_index/src}/query_pip.rs (78%) rename {airmail/src/poi => airmail_index/src}/substitutions.rs (97%) diff --git a/Cargo.toml b/Cargo.toml index e450efd..7ea261d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,5 @@ [workspace] resolver = "2" members = [ - "airmail", "airmail_common", - "airmail_parser", - "parser_demo", + "airmail", "airmail_common", "airmail_index", "airmail_parser", "parser_demo", ] diff --git a/airmail/Cargo.toml b/airmail/Cargo.toml index a72db10..21f587d 100644 --- a/airmail/Cargo.toml +++ b/airmail/Cargo.toml @@ -6,10 +6,6 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -bollard = "0.15.0" -clap = { version = "4.4.18", features = ["derive"] } -futures-util = "0.3.30" -geojson = "0.24.1" levenshtein_automata = "0.2.1" s2 = "0.0.12" tantivy = "0.21.1" @@ -17,21 +13,5 @@ tantivy-common = "0.6.0" tantivy-fst = "0.4.0" tempfile = "3.9.0" airmail_common = { path = "../airmail_common" } -deunicode = "1.4.2" -regex = "1.10.3" -lazy_static = "1.4.0" -reqwest = { version = "0.11", features = ["json"] } -tokio = { version = "1", features = ["full"] } -serde = { version = "1.0.196", features = ["derive"] } log = "0.4.20" -env_logger = "0.11.1" -crossbeam = { version = "0.8.4", features = ["crossbeam-channel"] } -rayon = "1.8.1" -serde_json = "1.0.113" airmail_parser = { path = "../airmail_parser" } - -[[bin]] -name = "index" - -[[bin]] -name = "query" diff --git a/airmail/src/bin/query.rs b/airmail/src/bin/query.rs deleted file mode 100644 index 604380b..0000000 --- a/airmail/src/bin/query.rs +++ /dev/null @@ -1,25 +0,0 @@ -use airmail::index::AirmailIndex; -use clap::Parser; - -#[derive(Debug, Parser)] -struct Args { - #[clap(long, short)] - index: String, -} - -#[tokio::main] -async fn main() -> Result<(), Box> { - let args = Args::parse(); - println!("{:?}", args); - let index = AirmailIndex::new(&args.index)?; - - let query = "425 harvard ave, seattle"; - let parsed = airmail_parser::query::Query::parse(query); - - let scenarios = parsed.scenarios(); - let top = scenarios.first().unwrap(); - - let _results = index.search(top); - - Ok(()) -} diff --git a/airmail/src/index.rs b/airmail/src/index.rs index 8d9f3d7..963f2c9 100644 --- a/airmail/src/index.rs +++ b/airmail/src/index.rs @@ -2,8 +2,8 @@ use airmail_parser::{component::QueryComponentType, query::QueryScenario}; use tantivy::{ collector::TopDocs, directory::MmapDirectory, - query::{BooleanQuery, PhrasePrefixQuery, Query, TermQuery}, - schema::{IndexRecordOption, Schema, INDEXED, STORED, TEXT}, + query::{BooleanQuery, FuzzyTermQuery, PhrasePrefixQuery, Query}, + schema::{FacetOptions, Schema, INDEXED, STORED, TEXT}, Term, }; @@ -11,6 +11,7 @@ use crate::poi::AirmailPoi; // Field name keys. pub const FIELD_NAME: &str = "name"; +pub const FIELD_SOURCE: &str = "source"; pub const FIELD_CATEGORY: &str = "category"; pub const FIELD_HOUSE_NUMBER: &str = "house_number"; pub const FIELD_ROAD: &str = "road"; @@ -18,6 +19,7 @@ pub const FIELD_UNIT: &str = "unit"; pub const FIELD_LOCALITY: &str = "locality"; pub const FIELD_REGION: &str = "region"; pub const FIELD_S2CELL: &str = "s2cell"; +pub const FIELD_TAGS: &str = "tags"; pub struct AirmailIndex { tantivy_index: tantivy::Index, @@ -26,6 +28,8 @@ pub struct AirmailIndex { fn query_for_terms( field: tantivy::schema::Field, terms: Vec<&str>, + is_prefix: bool, + distance: u8, ) -> Result, Box> { if terms.len() > 1 { Ok(Box::new(PhrasePrefixQuery::new( @@ -35,10 +39,19 @@ fn query_for_terms( .collect(), ))) } else { - Ok(Box::new(TermQuery::new( - Term::from_field_text(field, terms[0]), - IndexRecordOption::Basic, - ))) + if is_prefix { + Ok(Box::new(FuzzyTermQuery::new_prefix( + Term::from_field_text(field, terms[0]), + distance, + true, + ))) + } else { + Ok(Box::new(FuzzyTermQuery::new( + Term::from_field_text(field, terms[0]), + distance, + true, + ))) + } } } @@ -46,7 +59,9 @@ impl AirmailIndex { fn schema() -> tantivy::schema::Schema { let mut schema_builder = Schema::builder(); let _ = schema_builder.add_text_field(FIELD_NAME, TEXT | STORED); - let _ = schema_builder.add_text_field(FIELD_CATEGORY, TEXT | STORED); + let _ = schema_builder.add_text_field(FIELD_SOURCE, TEXT | STORED); + let _ = + schema_builder.add_facet_field(FIELD_CATEGORY, FacetOptions::default().set_stored()); let _ = schema_builder.add_text_field(FIELD_HOUSE_NUMBER, TEXT | STORED); let _ = schema_builder.add_text_field(FIELD_ROAD, TEXT | STORED); let _ = schema_builder.add_text_field(FIELD_UNIT, TEXT | STORED); @@ -110,7 +125,7 @@ impl AirmailIndex { } pub fn writer(&mut self) -> Result> { - let tantivy_writer = self.tantivy_index.writer(50_000_000)?; + let tantivy_writer = self.tantivy_index.writer(200_000_000)?; let writer = AirmailIndexWriter { tantivy_writer, schema: self.tantivy_index.schema(), @@ -122,8 +137,15 @@ impl AirmailIndex { let tantivy_reader = self.tantivy_index.reader()?; let searcher = tantivy_reader.searcher(); let mut queries: Vec> = Vec::new(); - for component in &query.as_vec() { - let terms: Vec<&str> = component.text().split_whitespace().collect(); + let query_vec = query.as_vec(); + for (i, component) in query_vec.iter().enumerate() { + let is_prefix = i == query_vec.len() - 1; + let terms: Vec = component + .text() + .split_whitespace() + .map(|term| term.to_lowercase()) + .collect(); + let term_strs = terms.iter().map(|s| s.as_str()).collect(); if terms.is_empty() { continue; } @@ -137,11 +159,16 @@ impl AirmailIndex { } QueryComponentType::HouseNumberComponent => { - queries.push(query_for_terms(self.field_house_number(), terms)?); + queries.push(query_for_terms( + self.field_house_number(), + term_strs, + is_prefix, + 0, // Never fuzzy match house numbers. + )?); } QueryComponentType::RoadComponent => { - queries.push(query_for_terms(self.field_road(), terms)?); + queries.push(query_for_terms(self.field_road(), term_strs, is_prefix, 1)?); } QueryComponentType::IntersectionComponent => { @@ -149,15 +176,25 @@ impl AirmailIndex { } QueryComponentType::SublocalityComponent => { - // No-op + // No-op, and probably always will be. "Downtown" is very subjective, for example. } QueryComponentType::LocalityComponent => { - queries.push(query_for_terms(self.field_locality(), terms)?); + queries.push(query_for_terms( + self.field_locality(), + term_strs, + is_prefix, + 1, + )?); } QueryComponentType::RegionComponent => { - queries.push(query_for_terms(self.field_region(), terms)?); + queries.push(query_for_terms( + self.field_region(), + term_strs, + is_prefix, + 1, + )?); } QueryComponentType::CountryComponent => { @@ -165,7 +202,21 @@ impl AirmailIndex { } QueryComponentType::PlaceNameComponent => { - // No-op + let mut new_terms = Vec::new(); + for term in &terms { + if term.ends_with("'s") { + new_terms.push(term.trim_end_matches("'s")); + } else if term.ends_with("s") { + new_terms.push(term.trim_end_matches('s')); + } + } + let original = query_for_terms(self.field_name(), term_strs, is_prefix, 1)?; + if new_terms.is_empty() { + queries.push(original); + } else { + let modified = query_for_terms(self.field_name(), new_terms, is_prefix, 1)?; + queries.push(Box::new(BooleanQuery::union(vec![original, modified]))); + } } QueryComponentType::IntersectionJoinWordComponent => { @@ -176,8 +227,7 @@ impl AirmailIndex { let query = BooleanQuery::intersection(queries); let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; - println!("Found {} hits", top_docs.len()); - let results = Vec::new(); + let mut results = Vec::new(); for (_score, doc_address) in top_docs { let doc = searcher.doc(doc_address)?; let house_num: Vec<&str> = doc @@ -203,11 +253,15 @@ impl AirmailIndex { .unwrap(); let cellid = s2::cellid::CellID(s2cell); let latlng = s2::latlng::LatLng::from(cellid); + let names = doc + .get_all(self.field_name()) + .filter_map(|v| v.as_text()) + .collect::>(); - println!( - "house_num: {:?}, road: {:?}, unit: {:?}, locality: {:?}, latlng: {:?}", - house_num, road, unit, locality, latlng - ); + results.push(format!( + "names: {:?}, house_num: {:?}, road: {:?}, unit: {:?}, locality: {:?}, latlng: {:?}", + names, house_num, road, unit, locality, latlng + )); } Ok(results) @@ -225,6 +279,10 @@ impl AirmailIndexWriter { for name in poi.name { doc.add_text(self.schema.get_field(FIELD_NAME).unwrap(), name); } + doc.add_text(self.schema.get_field(FIELD_SOURCE).unwrap(), poi.source); + for category in poi.category { + doc.add_facet(self.schema.get_field(FIELD_CATEGORY).unwrap(), &category); + } for house_number in poi.house_number { doc.add_text( self.schema.get_field(FIELD_HOUSE_NUMBER).unwrap(), diff --git a/airmail/src/lib.rs b/airmail/src/lib.rs index df72a76..fb89515 100644 --- a/airmail/src/lib.rs +++ b/airmail/src/lib.rs @@ -1,6 +1,3 @@ -#[macro_use] -extern crate lazy_static; - pub mod index; pub mod parser; pub mod poi; diff --git a/airmail/src/poi.rs b/airmail/src/poi.rs new file mode 100644 index 0000000..0c03fa3 --- /dev/null +++ b/airmail/src/poi.rs @@ -0,0 +1,51 @@ +use std::error::Error; + +use airmail_common::categories::PoiCategory; + +#[derive(Debug, Clone)] +pub struct AirmailPoi { + pub name: Vec, + pub source: String, + pub category: Vec, + pub house_number: Vec, + pub road: Vec, + pub unit: Vec, + pub locality: Vec, + pub region: Vec, + pub country: Vec, + pub s2cell: u64, + pub tags: Vec<(String, String)>, +} + +impl AirmailPoi { + pub fn new( + name: Vec, + source: String, + category: Vec, + house_number: Vec, + road: Vec, + unit: Vec, + lat: f64, + lng: f64, + tags: Vec<(String, String)>, + ) -> Result> { + let s2cell = s2::cellid::CellID::from(s2::latlng::LatLng::from_degrees(lat, lng)).0; + + Ok(Self { + name, + source, + category: category + .iter() + .map(|category| category.to_facet()) + .collect(), // FIXME. + house_number, + road, + unit, + locality: Vec::new(), + region: Vec::new(), + country: Vec::new(), + s2cell, + tags, + }) + } +} diff --git a/airmail/src/poi/mod.rs b/airmail/src/poi/mod.rs deleted file mode 100644 index 69f56c0..0000000 --- a/airmail/src/poi/mod.rs +++ /dev/null @@ -1,77 +0,0 @@ -use std::error::Error; - -use airmail_common::categories::PoiCategory; -use s2::{cellid::CellID, latlng::LatLng}; - -pub mod openaddresses; -pub mod query_pip; -pub mod substitutions; - -#[derive(Debug, Clone)] -pub struct AirmailPoi { - pub name: Vec, - pub category: Vec, - pub house_number: Vec, - pub road: Vec, - pub unit: Vec, - pub locality: Vec, - pub region: Vec, - pub country: Vec, - pub s2cell: u64, -} - -impl AirmailPoi { - pub fn new( - name: Vec, - category: Vec, - house_number: Vec, - road: Vec, - unit: Vec, - lat: f64, - lng: f64, - ) -> Result> { - let s2cell = s2::cellid::CellID::from(s2::latlng::LatLng::from_degrees(lat, lng)).0; - - Ok(Self { - name, - category: vec![], // FIXME. - house_number, - road, - unit, - locality: Vec::new(), - region: Vec::new(), - country: Vec::new(), - s2cell, - }) - } - - pub async fn populate_admin_areas(&mut self) -> Result<(), Box> { - let cell = CellID(self.s2cell); - let latlng = LatLng::from(cell); - let pip_response = query_pip::query_pip(latlng.lat.deg(), latlng.lng.deg()).await?; - let locality = pip_response - .locality - .unwrap_or_default() - .iter() - .map(|a| a.name.to_lowercase()) - .collect(); - let region = pip_response - .region - .unwrap_or_default() - .iter() - .map(|a| a.name.to_lowercase()) - .collect(); - let country = pip_response - .country - .unwrap_or_default() - .iter() - .map(|a| a.name.to_lowercase()) - .collect(); - - self.locality = locality; - self.region = region; - self.country = country; - - Ok(()) - } -} diff --git a/airmail_common/Cargo.toml b/airmail_common/Cargo.toml index ed0eef5..0f8ff2a 100644 --- a/airmail_common/Cargo.toml +++ b/airmail_common/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" cached = "0.48.1" fst = { version = "0.4.7", features = ["levenshtein"] } lazy_static = "1.4.0" +deunicode = "1.4.2" [build-dependencies] deunicode = "1.4.2" diff --git a/airmail_common/src/categories.rs b/airmail_common/src/categories.rs index 35be712..cf9373a 100644 --- a/airmail_common/src/categories.rs +++ b/airmail_common/src/categories.rs @@ -1,3 +1,30 @@ +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum AmenityPoiCategory { + /// A public toilet or shower. + Toilets, + /// A public shelter, e.g. a bus shelter or a picnic shelter. + Shelter, + /// Public water source. + DrinkingWater, + /// A public telephone. + Telephone, + /// A public library. + Library, +} + +impl AmenityPoiCategory { + pub fn to_facet(&self) -> String { + match self { + AmenityPoiCategory::Toilets => "toilets".to_string(), + AmenityPoiCategory::Shelter => "shelter".to_string(), + AmenityPoiCategory::DrinkingWater => "drinking_water".to_string(), + AmenityPoiCategory::Telephone => "telephone".to_string(), + AmenityPoiCategory::Library => "library".to_string(), + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub enum NaturalPoiCategory { /// A mountain, hill, or other point of elevation. Peak, @@ -9,6 +36,20 @@ pub enum NaturalPoiCategory { Other { raw_tag: String }, } +impl NaturalPoiCategory { + pub fn to_facet(&self) -> String { + match self { + NaturalPoiCategory::Peak => "peak".to_string(), + NaturalPoiCategory::Water => "water".to_string(), + NaturalPoiCategory::Wood => "wood".to_string(), + NaturalPoiCategory::Other { raw_tag } => { + format!("other/{}", deunicode::deunicode(raw_tag)) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub enum TransitPoiCategory { /// A bus stop. BusStop, @@ -26,6 +67,74 @@ pub enum TransitPoiCategory { Other { raw_tag: String }, } +impl TransitPoiCategory { + pub fn to_facet(&self) -> String { + match self { + TransitPoiCategory::BusStop => "bus_stop".to_string(), + TransitPoiCategory::TrainStation => "train_station".to_string(), + TransitPoiCategory::Airport => "airport".to_string(), + TransitPoiCategory::FerryTerminal => "ferry_terminal".to_string(), + TransitPoiCategory::SubwayStation => "subway_station".to_string(), + TransitPoiCategory::TramStop => "tram_stop".to_string(), + TransitPoiCategory::Other { raw_tag } => { + format!("other/{}", deunicode::deunicode(raw_tag)) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum CuisineCategory { + /// African cuisine. + African, + /// American cuisine. + American, + /// Asian cuisine. + Asian, + /// European cuisine. + European, + /// Middle Eastern cuisine. + MiddleEastern, + /// Other cuisine. + Other { raw_tag: String }, +} + +impl CuisineCategory { + pub fn to_facet(&self) -> String { + match self { + CuisineCategory::African => "african".to_string(), + CuisineCategory::American => "american".to_string(), + CuisineCategory::Asian => "asian".to_string(), + CuisineCategory::European => "european".to_string(), + CuisineCategory::MiddleEastern => "middle_eastern".to_string(), + CuisineCategory::Other { raw_tag } => { + format!("other/{}", deunicode::deunicode(raw_tag)) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] +pub enum EmergencyPoiCategory { + /// A fire station. + FireStation, + /// A hospital. + Hospital, + /// A police station. + PoliceStation, +} + +impl EmergencyPoiCategory { + pub fn to_facet(&self) -> String { + match self { + EmergencyPoiCategory::FireStation => "fire_station".to_string(), + EmergencyPoiCategory::Hospital => "hospital".to_string(), + EmergencyPoiCategory::PoliceStation => "police_station".to_string(), + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub enum FoodPoiCategory { /// A place to buy baked goods. Bakery, @@ -34,16 +143,42 @@ pub enum FoodPoiCategory { /// A place to buy groceries. Grocery, /// A restaurant or cafe. - Restaurant, + Restaurant(Option), /// Other food shop. Other { raw_tag: String }, } +impl FoodPoiCategory { + pub fn to_facet(&self) -> String { + match self { + FoodPoiCategory::Bakery => "bakery".to_string(), + FoodPoiCategory::Beverage => "beverage".to_string(), + FoodPoiCategory::Grocery => "grocery".to_string(), + FoodPoiCategory::Restaurant(Some(cuisine)) => { + format!("restaurant/{}", cuisine.to_facet()) + } + FoodPoiCategory::Restaurant(None) => "restaurant".to_string(), + FoodPoiCategory::Other { raw_tag } => { + format!("other/{}", deunicode::deunicode(raw_tag)) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub enum ShopPoiCategory { + /// An adult store, e.g. a sex shop, strip club or bathhouse. + Adult, /// A place to buy art. Art, + /// A bank or ATM. + Bank, + /// A bar. + Bar, /// A place to buy books. Books, + /// A doctor's office. + Clinic, /// A place to buy clothes. Clothes, /// A place to buy convenience goods. @@ -52,30 +187,26 @@ pub enum ShopPoiCategory { Electronics, /// A place to buy flowers. Florist, - /// A place to buy food. - Food(), + /// A place to buy food, including restaurants and grocery stores. + Food(FoodPoiCategory), /// A place to buy furniture. Furniture, /// A place to buy garden supplies. - GardenCentre, - /// A place to buy gifts. Gift, - /// A place to buy hardware. + /// A hardware store, garden store, or big-box home improvement retailer. Hardware, /// A place to buy health supplies. Health, /// A place to buy jewelry. Jewelry, - /// A place to buy laundry supplies. - Laundry, - /// A place to buy liquor. + /// A place to buy liquor (not a bar). Liquor, /// A place to buy music. Music, - /// A place to buy news. - Newsagent, /// A place to buy pet supplies. Pet, + /// A pharmacy. + Pharmacy, /// A place to buy photo supplies. Photo, /// A place to buy shoes. @@ -86,19 +217,61 @@ pub enum ShopPoiCategory { Tobacco, /// A place to buy toys. Toys, + /// A veterinarian's office. + Veterinary, /// A place to buy video games. - Video, + VideoGame, /// Other shop. Other { raw_tag: String }, } +impl ShopPoiCategory { + pub fn to_facet(&self) -> String { + match self { + ShopPoiCategory::Adult => "adult".to_string(), + ShopPoiCategory::Art => "art".to_string(), + ShopPoiCategory::Bank => "bank".to_string(), + ShopPoiCategory::Bar => "bar".to_string(), + ShopPoiCategory::Books => "books".to_string(), + ShopPoiCategory::Clothes => "clothes".to_string(), + ShopPoiCategory::Clinic => "clinic".to_string(), + ShopPoiCategory::Convenience => "convenience".to_string(), + ShopPoiCategory::Electronics => "electronics".to_string(), + ShopPoiCategory::Florist => "florist".to_string(), + ShopPoiCategory::Food(food) => format!("food/{}", food.to_facet()), + ShopPoiCategory::Furniture => "furniture".to_string(), + ShopPoiCategory::Gift => "gift".to_string(), + ShopPoiCategory::Hardware => "hardware".to_string(), + ShopPoiCategory::Health => "health".to_string(), + ShopPoiCategory::Jewelry => "jewelry".to_string(), + ShopPoiCategory::Liquor => "liquor".to_string(), + ShopPoiCategory::Music => "music".to_string(), + ShopPoiCategory::Pet => "pet".to_string(), + ShopPoiCategory::Pharmacy => "pharmacy".to_string(), + ShopPoiCategory::Photo => "photo".to_string(), + ShopPoiCategory::Shoes => "shoes".to_string(), + ShopPoiCategory::Sports => "sports".to_string(), + ShopPoiCategory::Tobacco => "tobacco".to_string(), + ShopPoiCategory::Toys => "toys".to_string(), + ShopPoiCategory::Veterinary => "veterinary".to_string(), + ShopPoiCategory::VideoGame => "video_game".to_string(), + ShopPoiCategory::Other { raw_tag } => { + format!("other/{}", deunicode::deunicode(raw_tag)) + } + } + } +} + +#[derive(Debug, Clone, PartialEq, PartialOrd)] pub enum PoiCategory { /// An address without additional information, e.g. from OpenAddresses or an untagged OSM node. Address, /// An administrative area, e.g. a country, state, or city. AdminArea, + /// Amenities for public use, e.g. public toilets, drinking fountains. + Amenity(AmenityPoiCategory), /// A place to go in an emergency, e.g. a fire station or hospital. - Emergency, + Emergency(EmergencyPoiCategory), /// A road or path. Highway, /// Land use, e.g. a park or a school. @@ -116,3 +289,22 @@ pub enum PoiCategory { /// A tourist attraction, e.g. a museum or viewpoint. Tourism, } + +impl PoiCategory { + pub fn to_facet(&self) -> String { + match self { + PoiCategory::Address => "/address".to_string(), + PoiCategory::AdminArea => "/admin_area".to_string(), + PoiCategory::Amenity(amenity) => format!("/amenity/{}", amenity.to_facet()), + PoiCategory::Emergency(emergency) => format!("/emergency/{}", emergency.to_facet()), + PoiCategory::Highway => "/highway".to_string(), + PoiCategory::Landuse => "/landuse".to_string(), + PoiCategory::Leisure => "/leisure".to_string(), + PoiCategory::Natural(natural) => format!("/natural/{}", natural.to_facet()), + PoiCategory::Transit(transit) => format!("/transit/{}", transit.to_facet()), + PoiCategory::Shop(shop) => format!("/shop/{}", shop.to_facet()), + PoiCategory::Sport => "/sport".to_string(), + PoiCategory::Tourism => "/tourism".to_string(), + } + } +} diff --git a/airmail_index/Cargo.toml b/airmail_index/Cargo.toml new file mode 100644 index 0000000..a1fe8b6 --- /dev/null +++ b/airmail_index/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "airmail_index" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +airmail = { path = "../airmail" } +reqwest = { version = "0.11", features = ["json"] } +tokio = { version = "1", features = ["full"] } +crossbeam = { version = "0.8.4", features = ["crossbeam-channel"] } +rayon = "1.8.1" +serde_json = "1.0.113" +s2 = "0.0.12" +geojson = "0.24.1" +serde = { version = "1.0.196", features = ["derive"] } +airmail_common = { path = "../airmail_common" } +lazy_static = "1.4.0" +bollard = "0.15.0" +clap = { version = "4.4.18", features = ["derive"] } +futures-util = "0.3.30" +env_logger = "0.11.1" +airmail_parser = { path = "../airmail_parser" } +regex = "1.10.3" +deunicode = "1.4.2" +log = "0.4.20" +geo = "0.27.0" +osmpbf = "0.3.3" +rand = "0.8.5" +subprocess = "0.2.9" +rustyline = "13.0.0" + +[[bin]] +name = "query" diff --git a/airmail/permute_dicts/en/localities.txt b/airmail_index/permute_dicts/en/localities.txt similarity index 100% rename from airmail/permute_dicts/en/localities.txt rename to airmail_index/permute_dicts/en/localities.txt diff --git a/airmail/permute_dicts/en/street_types.txt b/airmail_index/permute_dicts/en/street_types.txt similarity index 100% rename from airmail/permute_dicts/en/street_types.txt rename to airmail_index/permute_dicts/en/street_types.txt diff --git a/airmail_index/src/bin/query.rs b/airmail_index/src/bin/query.rs new file mode 100644 index 0000000..14ce01a --- /dev/null +++ b/airmail_index/src/bin/query.rs @@ -0,0 +1,49 @@ +use airmail::index::AirmailIndex; +use clap::Parser; +use rustyline::DefaultEditor; + +#[derive(Debug, Parser)] +struct Args { + #[clap(long, short)] + index: String, +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = Args::parse(); + println!("{:?}", args); + let index = AirmailIndex::new(&args.index)?; + let mut rl = DefaultEditor::new()?; + loop { + let query = rl.readline("query: ")?; + rl.add_history_entry(query.as_str())?; + let start = std::time::Instant::now(); + let query = query.trim().to_lowercase(); + let parsed = airmail_parser::query::Query::parse(&query); + + let scenarios = parsed.scenarios(); + let results: Option> = scenarios + .iter() + .take(10) + .filter_map(|scenario| { + let results = index.search(scenario).unwrap(); + if results.is_empty() { + None + } else { + Some(results) + } + }) + .next(); + + println!(); + if let Some(results) = results { + for result in &results { + println!(" - {}", result); + } + println!("{} results found in {:?}", results.len(), start.elapsed()); + } else { + println!("No results found in {:?}.", start.elapsed()); + } + println!(); + } +} diff --git a/airmail/src/bin/index.rs b/airmail_index/src/main.rs similarity index 55% rename from airmail/src/bin/index.rs rename to airmail_index/src/main.rs index 69f81aa..7cae40c 100644 --- a/airmail/src/bin/index.rs +++ b/airmail_index/src/main.rs @@ -1,6 +1,12 @@ -use std::{collections::HashMap, str::FromStr, sync::Arc}; +#[macro_use] +extern crate lazy_static; -use airmail::poi::{openaddresses::parse_oa_geojson, AirmailPoi}; +pub mod openaddresses; +pub mod openstreetmap; +pub mod query_pip; +pub mod substitutions; + +use airmail::poi::AirmailPoi; use bollard::{ container::{ CreateContainerOptions, ListContainersOptions, RemoveContainerOptions, @@ -14,6 +20,8 @@ use clap::Parser; use crossbeam::channel::{Receiver, Sender}; use futures_util::TryStreamExt; use geojson::GeoJson; +use s2::{cellid::CellID, latlng::LatLng}; +use std::{collections::HashMap, env::temp_dir, error::Error, str::FromStr, sync::Arc}; use tokio::{ io::{AsyncBufReadExt, BufReader}, spawn, @@ -21,6 +29,41 @@ use tokio::{ task::spawn_blocking, }; +use crate::openaddresses::parse_oa_geojson; + +pub async fn populate_admin_areas(poi: &mut AirmailPoi, port: usize) -> Result<(), Box> { + let cell = CellID(poi.s2cell); + let latlng = LatLng::from(cell); + let pip_response = query_pip::query_pip(latlng.lat.deg(), latlng.lng.deg(), port).await?; + let mut locality: Vec = pip_response + .locality + .unwrap_or_default() + .iter() + .map(|a| a.name.to_lowercase()) + .collect(); + if let Some(neighbourhood) = pip_response.neighbourhood { + locality.extend(neighbourhood.iter().map(|a| a.name.to_lowercase())); + } + let region = pip_response + .region + .unwrap_or_default() + .iter() + .map(|a| a.name.to_lowercase()) + .collect(); + let country = pip_response + .country + .unwrap_or_default() + .iter() + .map(|a| a.name.to_lowercase()) + .collect(); + + poi.locality = locality; + poi.region = region; + poi.country = country; + + Ok(()) +} + #[derive(Debug, Parser)] struct Args { /// Path to the Docker socket. @@ -35,6 +78,9 @@ struct Args { /// Path to an OpenAddresses data file. #[clap(long, short)] openaddresses: Option, + /// Path to an OpenStreetMap pbf file. + #[clap(long, short)] + osmpbf: Option, /// Path to the Airmail index. #[clap(long, short)] index: String, @@ -58,7 +104,7 @@ async fn docker_connect() -> Result> { Ok(docker) } -async fn get_container_status() -> Result> { +async fn get_container_status(idx: usize) -> Result> { let docker = docker_connect().await?; let containers = &docker @@ -70,7 +116,7 @@ async fn get_container_status() -> Result Result Result<(), Box> { - let container_state = get_container_status().await?; + let container_state = get_container_status(idx).await?; if container_state == ContainerStatus::Running && !recreate { - println!("Container `airmail-pip-service` is already running."); + println!( + "Container `airmail-pip-service-{}` is already running.", + idx + ); return Ok(()); } @@ -99,10 +149,10 @@ async fn maybe_start_container( env: Some(vec![]), host_config: Some(HostConfig { port_bindings: Some(HashMap::from([( - "3102/tcp".to_string(), + 3102.to_string(), Some(vec![bollard::models::PortBinding { host_ip: None, - host_port: Some("3102".to_string()), + host_port: Some(format!("{}", 3102 + idx)), }]), )])), mounts: Some(vec![bollard::models::Mount { @@ -133,21 +183,27 @@ async fn maybe_start_container( .await?; if recreate { - println!("Stopping container `airmail-pip-service`"); + println!("Stopping container `airmail-pip-service-{}`", idx); let _ = &docker - .stop_container("airmail-pip-service", None::) - .await?; + .stop_container( + &format!("airmail-pip-service-{}", idx), + None::, + ) + .await; let _ = &docker - .remove_container("airmail-pip-service", None::) - .await?; + .remove_container( + &format!("airmail-pip-service-{}", idx), + None::, + ) + .await; } if container_state == ContainerStatus::DoesNotExist || recreate { - println!("Creating container `airmail-pip-service`"); + println!("Creating container `airmail-pip-service-{}`", idx); let _ = &docker .create_container( Some(CreateContainerOptions { - name: "airmail-pip-service", + name: &format!("airmail-pip-service-{}", idx), platform: None, }), pip_config, @@ -155,19 +211,22 @@ async fn maybe_start_container( .await?; } - println!("Starting container `airmail-pip-service`"); + println!("Starting container `airmail-pip-service-{}`", idx); let _ = &docker - .start_container("airmail-pip-service", None::>) + .start_container( + &format!("airmail-pip-service-{}", idx), + None::>, + ) .await?; println!("Waiting for container to start."); tokio::time::sleep(std::time::Duration::from_secs(2)).await; - if get_container_status().await? == ContainerStatus::Running { - println!("Container `airmail-pip-service` is running."); + if get_container_status(idx).await? == ContainerStatus::Running { + println!("Container `airmail-pip-service-{}` is running.", idx); } else { - println!("Container `airmail-pip-service` failed to start."); - return Err("Container `airmail-pip-service` failed to start.".into()); + println!("Container `airmail-pip-service-{}` failed to start.", idx); + return Err(format!("Container `airmail-pip-service-{}` failed to start.", idx).into()); } Ok(()) @@ -177,7 +236,99 @@ async fn maybe_start_container( async fn main() -> Result<(), Box> { env_logger::init(); let args = Args::parse(); - maybe_start_container(&args.wof_db, args.recreate).await?; + let index_path = args.index.clone(); + let max_pip = 4; + for i in 0..max_pip { + let new_wof_file = temp_dir().join(format!("wof-{}.db", i)); + std::fs::copy(&args.wof_db, &new_wof_file)?; + // We don't care if this doesn't work. + let _ = subprocess::Exec::cmd("chcon") + .arg("-t") + .arg("container_file_t") + .arg(&new_wof_file) + .join(); + maybe_start_pip_container(&new_wof_file.to_string_lossy(), i, args.recreate).await?; + } + + if let Some(osmpbf_path) = args.osmpbf { + let mut nonblocking_join_handles = Vec::new(); + let (no_admin_sender, no_admin_receiver): (Sender, Receiver) = + crossbeam::channel::bounded(1024); + let (to_index_sender, to_index_receiver): (Sender, Receiver) = + crossbeam::channel::bounded(1024); + for _ in 0..16 { + let no_admin_receiver = no_admin_receiver.clone(); + let to_index_sender = to_index_sender.clone(); + nonblocking_join_handles.push(spawn(async move { + loop { + let mut poi = if let Ok(poi) = no_admin_receiver.recv() { + poi + } else { + break; + }; + let mut sent = false; + for _attempt in 0..5 { + let port = (rand::random::() % max_pip) + 3102; + if let Err(err) = populate_admin_areas(&mut poi, port).await { + println!("Failed to populate admin areas. {}", err); + } else { + to_index_sender.send(poi).unwrap(); + sent = true; + break; + } + } + if !sent { + println!("Failed to populate admin areas after 5 attempts."); + } + } + })); + } + let index_path = args.index.clone(); + let count = Arc::new(Mutex::new(0)); + let start = std::time::Instant::now(); + let count = count.clone(); + + let indexing_join_handle = spawn(async move { + let mut index = airmail::index::AirmailIndex::create(&index_path).unwrap(); + let mut writer = index.writer().unwrap(); + loop { + { + let mut count = count.lock().await; + *count += 1; + if *count % 1000 == 0 { + println!( + "{} POIs parsed in {} seconds, {} per second.", + *count, + start.elapsed().as_secs(), + *count as f64 / start.elapsed().as_secs_f64() + ); + } + } + + if let Ok(poi) = to_index_receiver.recv() { + if let Err(err) = writer.add_poi(poi) { + println!("Failed to add POI to index. {}", err); + } + } else { + break; + } + } + writer.commit().unwrap(); + }); + + openstreetmap::parse_osm(&osmpbf_path, &mut |poi| { + no_admin_sender.send(poi).unwrap(); + Ok(()) + }) + .unwrap(); + drop(no_admin_sender); + println!("Waiting for tasks to finish."); + for handle in nonblocking_join_handles { + handle.await.unwrap(); + } + drop(to_index_sender); + indexing_join_handle.await.unwrap(); + } if let Some(openaddresses_path) = args.openaddresses { let openaddresses_file = tokio::fs::File::open(openaddresses_path).await?; @@ -244,7 +395,8 @@ async fn main() -> Result<(), Box> { }; let mut sent = false; for _attempt in 0..5 { - if let Err(err) = poi.populate_admin_areas().await { + let port = (rand::random::() % max_pip) + 3102; + if let Err(err) = populate_admin_areas(&mut poi, port).await { println!("Failed to populate admin areas. {}", err); } else { to_index_sender.send(poi).unwrap(); @@ -259,7 +411,7 @@ async fn main() -> Result<(), Box> { })); } let indexing_join_handle = spawn(async move { - let mut index = airmail::index::AirmailIndex::create(&args.index).unwrap(); + let mut index = airmail::index::AirmailIndex::create(&index_path).unwrap(); let mut writer = index.writer().unwrap(); loop { if let Ok(poi) = to_index_receiver.recv() { diff --git a/airmail/src/poi/openaddresses.rs b/airmail_index/src/openaddresses.rs similarity index 89% rename from airmail/src/poi/openaddresses.rs rename to airmail_index/src/openaddresses.rs index 159d711..3d4fad2 100644 --- a/airmail/src/poi/openaddresses.rs +++ b/airmail_index/src/openaddresses.rs @@ -1,3 +1,4 @@ +use airmail_common::categories::PoiCategory; use geojson::{GeoJson, Value}; use super::{ @@ -12,7 +13,7 @@ pub fn parse_oa_geojson( GeoJson::Feature(feature) => { let properties = feature.properties.as_ref().unwrap(); let name = vec![]; - let category = vec![]; + let category = vec![PoiCategory::Address]; let house_number = if let Some(house_num) = properties.get("number").map(|v| v.as_str()).flatten() { permute_housenum(house_num)? @@ -43,12 +44,14 @@ pub fn parse_oa_geojson( }; Ok(AirmailPoi::new( name, + "openaddresses".to_string(), category, house_number, road, unit, lat, lng, + vec![], // OpenAddresses doesn't have tags. :( )?) } _ => Err("Not a feature".into()), diff --git a/airmail_index/src/openstreetmap.rs b/airmail_index/src/openstreetmap.rs new file mode 100644 index 0000000..40609fe --- /dev/null +++ b/airmail_index/src/openstreetmap.rs @@ -0,0 +1,185 @@ +use std::{collections::HashMap, fs::File}; + +use airmail::poi::AirmailPoi; +use airmail_common::categories::{ + AmenityPoiCategory, EmergencyPoiCategory, FoodPoiCategory, PoiCategory, ShopPoiCategory, +}; +use geo::{Centroid, Coord, LineString, Polygon}; +use log::warn; +use osmpbf::Element; + +use crate::substitutions::{permute_housenum, permute_road, permute_unit}; + +fn tags_to_poi(tags: &HashMap<&str, &str>, lat: f64, lng: f64) -> Option { + if tags.contains_key("highway") { + return None; + } + + let category = tags.get("amenity").map(|s| match *s { + "fast_food" | "food_court" | "cafe" | "pub" | "restaurant" => { + PoiCategory::Shop(ShopPoiCategory::Food(FoodPoiCategory::Restaurant(None))) + } + "biergarten" | "bar" => { + PoiCategory::Shop(ShopPoiCategory::Food(FoodPoiCategory::Restaurant(None))) + } + "drinking_water" => PoiCategory::Amenity(AmenityPoiCategory::DrinkingWater), + "toilets" => PoiCategory::Amenity(AmenityPoiCategory::Toilets), + "shelter" => PoiCategory::Amenity(AmenityPoiCategory::Shelter), + "telephone" => PoiCategory::Amenity(AmenityPoiCategory::Telephone), + "bank" | "atm" => PoiCategory::Shop(ShopPoiCategory::Bank), + "pharmacy" => PoiCategory::Shop(ShopPoiCategory::Health), + "hospital" => PoiCategory::Emergency(EmergencyPoiCategory::Hospital), + "clinic" => PoiCategory::Shop(ShopPoiCategory::Clinic), + "dentist" => PoiCategory::Shop(ShopPoiCategory::Clinic), // TODO: subfacet here? + "veterinary" => PoiCategory::Shop(ShopPoiCategory::Veterinary), + "library" => PoiCategory::Amenity(AmenityPoiCategory::Library), + _ => PoiCategory::Address, + }); + + let house_number = tags + .get("addr:housenumber") + .map(|s| permute_housenum(&s).unwrap()) + .unwrap_or_default(); + let road = tags + .get("addr:street") + .map(|s| permute_road(&s).unwrap()) + .unwrap_or_default(); + let unit = tags + .get("addr:unit") + .map(|s| permute_unit(&s).unwrap()) + .unwrap_or_default(); + + let names = { + let mut names = Vec::new(); + tags.iter() + .filter(|(key, _value)| key.contains("name:") || key.to_string() == "name") + .for_each(|(_key, value)| { + names.push(value.to_string()); + }); + names + }; + + if (house_number.is_empty() || road.is_empty()) && names.is_empty() { + return None; + } + + Some( + AirmailPoi::new( + names, + "osm".to_string(), + category.into_iter().collect(), + house_number, + road, + unit, + lat, + lng, + tags.iter() + .map(|(k, v)| (k.to_string(), v.to_string())) + .collect(), + ) + .unwrap(), + ) +} + +pub fn parse_osm Result<(), Box>>( + pbf_path: &str, + callback: &CB, +) -> Result<(), Box> { + let file = File::open(pbf_path)?; + let reader = osmpbf::reader::ElementReader::new(file); + let ways_of_interest = reader.par_map_reduce( + |obj| match obj { + osmpbf::Element::Node(node) => { + let tags = node.tags().clone().collect(); + let lat = node.lat(); + let lng = node.lon(); + if let Some(poi) = tags_to_poi(&tags, lat, lng) { + if let Err(err) = callback(poi) { + warn!("Error: {}", err); + } + } + vec![] + } + osmpbf::Element::DenseNode(densenode) => { + let tags = densenode.tags().clone().collect(); + let lat = densenode.lat(); + let lng = densenode.lon(); + if let Some(poi) = tags_to_poi(&tags, lat, lng) { + if let Err(err) = callback(poi) { + warn!("Error: {}", err); + } + } + vec![] + } + osmpbf::Element::Way(way) => { + let tags = way.tags().clone().collect(); + let node_positions: Vec = way + .node_locations() + .map(|node| Coord { + x: node.lon(), + y: node.lat(), + }) + .collect(); + if node_positions.len() < 3 { + return vec![]; + } + let linestring = LineString::new(node_positions); + let polygon = Polygon::new(linestring, vec![]); + let (lng, lat) = polygon.centroid().unwrap().into(); + if let Some(poi) = tags_to_poi(&tags, lat, lng) { + if let Err(err) = callback(poi) { + warn!("Error: {}", err); + } + } + vec![] + } + osmpbf::Element::Relation(relation) => { + if let Some(outer) = relation.members().next() { + let tags: HashMap = relation + .tags() + .map(|(a, b)| (a.to_string(), b.to_string())) + .collect(); + return vec![(outer.member_id, tags)]; + } + vec![] + } + }, + || vec![], + |a, b| { + let mut a = a.clone(); + a.extend(b); + a + }, + )?; + + let file = File::open(pbf_path)?; + let reader = osmpbf::reader::ElementReader::new(file); + let ways_of_interest = ways_of_interest.into_iter().collect::>(); + reader.for_each(|obj| { + if let Element::Way(way) = obj { + if ways_of_interest.contains_key(&way.id()) { + let tags = ways_of_interest.get(&way.id()).unwrap(); + let node_positions: Vec = way + .node_locations() + .map(|node| Coord { + x: node.lon(), + y: node.lat(), + }) + .collect(); + if node_positions.len() < 3 { + return; + } + let linestring = LineString::new(node_positions); + let polygon = Polygon::new(linestring, vec![]); + let (lng, lat) = polygon.centroid().unwrap().into(); + let tags_str = tags.iter().map(|(k, v)| (k.as_str(), v.as_str())).collect(); + if let Some(poi) = tags_to_poi(&tags_str, lat, lng) { + if let Err(err) = callback(poi) { + warn!("Error: {}", err); + } + } + } + } + })?; + Ok(()) +} diff --git a/airmail/src/poi/query_pip.rs b/airmail_index/src/query_pip.rs similarity index 78% rename from airmail/src/poi/query_pip.rs rename to airmail_index/src/query_pip.rs index 11579b6..1242ad1 100644 --- a/airmail/src/poi/query_pip.rs +++ b/airmail_index/src/query_pip.rs @@ -11,6 +11,7 @@ pub struct PipAdminArea { #[derive(Debug, Clone, Deserialize)] pub struct PipResponse { pub locality: Option>, + pub neighbourhood: Option>, pub county: Option>, pub region: Option>, pub country: Option>, @@ -20,8 +21,8 @@ thread_local! { static HTTP_CLIENT: reqwest::Client = reqwest::Client::new(); } -pub async fn query_pip(lat: f64, lng: f64) -> Result> { - let url = format!("http://localhost:3102/{}/{}", lng, lat); +pub async fn query_pip(lat: f64, lng: f64, port: usize) -> Result> { + let url = format!("http://localhost:{}/{}/{}", port, lng, lat); let response = HTTP_CLIENT.with(|client| client.get(&url).send()).await?; if response.status() != 200 { return Err(format!("HTTP error: {}", response.status()).into()); diff --git a/airmail/src/poi/substitutions.rs b/airmail_index/src/substitutions.rs similarity index 97% rename from airmail/src/poi/substitutions.rs rename to airmail_index/src/substitutions.rs index feffe9a..e568ca0 100644 --- a/airmail/src/poi/substitutions.rs +++ b/airmail_index/src/substitutions.rs @@ -6,8 +6,7 @@ use regex::Regex; lazy_static! { static ref ASCII_WHITESPACE_RE: Regex = Regex::new(r"[ \t\r\n]+").unwrap(); static ref STREET_SUFFIXES_SUBS: SubstitutionDict = - SubstitutionDict::from_str(include_str!("../../permute_dicts/en/street_types.txt")) - .unwrap(); + SubstitutionDict::from_str(include_str!("../permute_dicts/en/street_types.txt")).unwrap(); } pub(super) struct SubstitutionDict { diff --git a/airmail_parser/src/component.rs b/airmail_parser/src/component.rs index 8be2b99..a689350 100644 --- a/airmail_parser/src/component.rs +++ b/airmail_parser/src/component.rs @@ -4,7 +4,10 @@ use crate::{ common::{query_sep, query_term}, fst::parse_fst, }; -use airmail_common::{dicts::*, fst::FstMatchMode}; +use airmail_common::{ + dicts::*, + fst::{search_fst, FstMatchMode}, +}; use nom::{bytes::complete::take_while, IResult}; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -185,8 +188,6 @@ define_component!( |_| 1.0f32 ); -// define_component!(HouseNameComponent); - fn parse_house_number(text: &str) -> IResult<&str, &str> { // TODO: This should be more general. Not all house numbers are numbers. take_while(|c: char| c.is_ascii_digit())(text) @@ -307,11 +308,89 @@ fn parse_sublocality(text: &str) -> IResult<&str, &str> { define_component!(SublocalityComponent, parse_sublocality, |_| 0.9f32); -fn parse_locality(text: &str) -> IResult<&str, &str> { - parse_fst(&localities_fst(), FstMatchMode::GreedyLevenshtein(0), text) +#[derive(Debug, Clone)] +pub struct LocalityComponent { + text: String, } -define_component!(LocalityComponent, parse_locality, |_| 1.5f32); +impl LocalityComponent { + fn new(text: String) -> Self { + Self { text } + } + + pub fn parse(text: &str) -> Vec<(Self, &str)> { + let mut scenarios = Vec::new(); + let mut substring_len = if let Ok((_, token)) = query_term(text) { + token.len() + } else { + return scenarios; + }; + + scenarios.push(( + Self::new(text[..substring_len].to_string()), + &text[substring_len..], + )); + + let mut sep_len = if let Ok((_, sep)) = query_sep(&text[substring_len..]) { + sep.len() + } else { + return scenarios; + }; + + loop { + substring_len += if let Ok((_, token)) = query_term(&text[substring_len + sep_len..]) { + if token.is_empty() { + break; + } + token.len() + } else { + break; + }; + substring_len += sep_len; + scenarios.push(( + Self::new(text[..substring_len].to_string()), + &text[substring_len..], + )); + if let Ok((_, sep)) = query_sep(&text[substring_len..]) { + sep_len = sep.len(); + } else { + break; + } + } + return scenarios; + } + + pub fn parse_boxed(text: &str) -> Vec<(Arc, &str)> { + Self::parse(text) + .into_iter() + .map(|(component, remainder)| { + (Arc::new(component) as Arc, remainder) + }) + .collect() + } +} + +impl QueryComponent for LocalityComponent { + fn text(&self) -> &str { + &self.text + } + + fn penalty_mult(&self) -> f32 { + if search_fst(localities_fst(), self.text.clone(), 0, false) { + 1.5f32 + } else { + 0.75f32 + } + } + + fn debug_name(&self) -> &'static str { + "LocalityComponent" + } + + fn component_type(&self) -> QueryComponentType { + QueryComponentType::LocalityComponent + } +} fn parse_region(text: &str) -> IResult<&str, &str> { parse_fst(®ions_fst(), FstMatchMode::GreedyLevenshtein(0), text) diff --git a/airmail_parser/src/scorers.rs b/airmail_parser/src/scorers.rs index 35d48b5..f6e4cb9 100644 --- a/airmail_parser/src/scorers.rs +++ b/airmail_parser/src/scorers.rs @@ -161,16 +161,19 @@ fn housenum_not_before_placename(scenario: &QueryScenario) -> f32 { fn naked_road_unlikely(scenario: &QueryScenario) -> f32 { let mut has_road = false; - let mut has_house_num = false; + let mut has_other = false; for component in scenario.as_vec() { if component.component_type() == QueryComponentType::RoadComponent { has_road = true; } if component.component_type() == QueryComponentType::HouseNumberComponent { - has_house_num = true; + has_other = true; + } + if component.component_type() == QueryComponentType::PlaceNameComponent { + has_other = true; } } - if has_road && !has_house_num { + if has_road && !has_other { return 0.05; } 1.0