Skip to content

Commit

Permalink
Merge pull request #3 from disasterscience/structure
Browse files Browse the repository at this point in the history
Structure
  • Loading branch information
tristan-morris authored Jul 25, 2024
2 parents 500283b + a451220 commit 1c32229
Show file tree
Hide file tree
Showing 23 changed files with 1,146 additions and 627 deletions.
766 changes: 641 additions & 125 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
[workspace]
resolver = "2"
members = [
"airmail", "airmail_import_osm", "airmail_indexer", "airmail_service",
]
members = ["airmail", "airmail_indexer", "airmail_service"]

[profile.release]
debug = 1
1 change: 0 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ RUN apt update && apt install -y libssl-dev clang pkg-config
WORKDIR /usr/src/airmail
COPY ./airmail ./airmail
COPY ./airmail_indexer ./airmail_indexer
COPY ./airmail_import_osm ./airmail_import_osm
COPY ./airmail_service ./airmail_service
COPY ./Cargo.toml ./Cargo.toml
COPY ./Cargo.lock ./Cargo.lock
Expand Down
5 changes: 1 addition & 4 deletions Dockerfile.build
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,10 @@ RUN apt-get update && apt-get install -y libssl-dev capnproto clang pkg-config l
WORKDIR /usr/src/airmail
COPY ./airmail ./airmail
COPY ./airmail_indexer ./airmail_indexer
COPY ./airmail_import_osm ./airmail_import_osm
COPY ./airmail_service ./airmail_service
COPY ./Cargo.toml ./Cargo.toml
COPY ./Cargo.lock ./Cargo.lock

RUN cargo install --path ./airmail_import_osm

RUN apt-get update && apt-get install -y podman
RUN cargo install --path ./airmail_indexer

WORKDIR /var/airmail
2 changes: 1 addition & 1 deletion QUICKSTART.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ ls -lh ./data/whosonfirst-data-admin-latest.spatial.db ./data/australia-latest.o

# Build the index
docker compose --profile index run build-index \
airmail_import_osm --wof-db /data/whosonfirst-data-admin-latest.spatial.db \
indexer --wof-db /data/whosonfirst-data-admin-latest.spatial.db \
--index /data/index \
--osmx /data/australia-latest.osm.osmx
```
Expand Down
2 changes: 2 additions & 0 deletions airmail/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ lazy_static = "1.4.0"
regex = "1.10.3"
geo = "0.27.0"
tantivy-uffd = "0.1.1"
anyhow = "1.0.86"
thiserror = "1.0.63"

[features]
invasive_logging = []
Expand Down
7 changes: 7 additions & 0 deletions airmail/src/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use thiserror::Error;

#[derive(Error, Debug)]
pub enum AirmailError {
#[error("unable to count")]
UnableToCount,
}
89 changes: 43 additions & 46 deletions airmail/src/index.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::Result;
use futures_util::future::join_all;
use geo::Rect;
use itertools::Itertools;
Expand All @@ -24,6 +26,7 @@ use tantivy_uffd::RemoteDirectory;
use tokio::task::spawn_blocking;
use unicode_segmentation::UnicodeSegmentation;

use crate::error::AirmailError;
use crate::{
poi::{AirmailPoi, SchemafiedPoi},
query::all_subsequences,
Expand Down Expand Up @@ -63,8 +66,8 @@ impl AirmailIndex {
.set_indexed()
.set_stored()
.set_fast();
assert_eq!(s2cell_parent_index_options.fieldnorms(), false);
assert_eq!(s2cell_index_options.fieldnorms(), false);
assert!(!s2cell_parent_index_options.fieldnorms());
assert!(!s2cell_index_options.fieldnorms());

let _ = schema_builder.add_text_field(FIELD_CONTENT, text_options.clone());
let _ = schema_builder.add_text_field(FIELD_INDEXED_TAG, tag_options);
Expand Down Expand Up @@ -109,7 +112,7 @@ impl AirmailIndex {
self.tantivy_index.schema().get_field(FIELD_TAGS).unwrap()
}

pub fn create(index_dir: &str) -> Result<Self, Box<dyn std::error::Error>> {
pub fn create(index_dir: &PathBuf) -> Result<Self> {
let schema = Self::schema();
let tantivy_index =
tantivy::Index::open_or_create(MmapDirectory::open(index_dir)?, schema)?;
Expand All @@ -119,15 +122,15 @@ impl AirmailIndex {
})
}

pub fn new(index_dir: &str) -> Result<Self, Box<dyn std::error::Error>> {
pub fn new(index_dir: &str) -> Result<Self> {
// Opens the tantivy index stored in the local directory `index_dir`.
// Any error from `open_in_dir` is propagated to the caller via `?`.
let tantivy_index = tantivy::Index::open_in_dir(index_dir)?;
Ok(Self {
// The index handle is wrapped in an `Arc` before being stored.
tantivy_index: Arc::new(tantivy_index),
// Opened from a local directory, so the index is not flagged as remote.
is_remote: false,
})
}

pub fn new_remote(base_url: &str) -> Result<Self, Box<dyn std::error::Error>> {
pub fn new_remote(base_url: &str) -> Result<Self> {
let tantivy_index =
tantivy::Index::open(RemoteDirectory::<{ 2 * 1024 * 1024 }>::new(base_url))?;
Ok(Self {
Expand All @@ -136,7 +139,7 @@ impl AirmailIndex {
})
}

pub fn writer(&mut self) -> Result<AirmailIndexWriter, Box<dyn std::error::Error>> {
pub fn writer(&mut self) -> Result<AirmailIndexWriter> {
let tantivy_writer = self
.tantivy_index
.writer::<TantivyDocument>(2_000_000_000)?;
Expand All @@ -147,7 +150,7 @@ impl AirmailIndex {
Ok(writer)
}

pub async fn merge(&mut self) -> Result<(), Box<dyn std::error::Error>> {
pub async fn merge(&mut self) -> Result<()> {
let ids = self.tantivy_index.searchable_segment_ids()?;
self.tantivy_index
.writer::<TantivyDocument>(2_000_000_000)?
Expand All @@ -156,7 +159,7 @@ impl AirmailIndex {
Ok(())
}

pub async fn num_docs(&self) -> Result<u64, Box<dyn std::error::Error>> {
pub async fn num_docs(&self) -> Result<u64> {
let index = self.tantivy_index.clone();
let count = spawn_blocking(move || {
if let Ok(tantivy_reader) = index.reader() {
Expand All @@ -165,7 +168,7 @@ impl AirmailIndex {
None
}
});
Ok(count.await?.ok_or("Error getting count")?)
Ok(count.await?.ok_or(AirmailError::UnableToCount)?)
}

async fn construct_query(
Expand Down Expand Up @@ -244,37 +247,35 @@ impl AirmailIndex {
boost,
)));
}
} else if possible_query.len() >= 8 && lenient {
let query = if tokens.ends_with(&[possible_query]) {
FuzzyTermQuery::new_prefix(term, 1, true)
} else {
FuzzyTermQuery::new(term, 1, true)
};
if self.is_remote {
let searcher = searcher.clone();
let query = query.clone();
spawn_blocking(move || {
let _ = searcher.search(&query, &Count);
});
}
mandatory_queries.push(Box::new(BoostQuery::new(Box::new(query), boost)));
} else {
if possible_query.len() >= 8 && lenient {
let query = if tokens.ends_with(&[possible_query]) {
FuzzyTermQuery::new_prefix(term, 1, true)
let query: Box<dyn Query> =
if self.is_remote || !lenient || !tokens.ends_with(&[possible_query]) {
Box::new(TermQuery::new(term, IndexRecordOption::Basic))
} else {
FuzzyTermQuery::new(term, 1, true)
Box::new(FuzzyTermQuery::new_prefix(term, 0, false))
};
if self.is_remote {
let searcher = searcher.clone();
let query = query.clone();
spawn_blocking(move || {
let _ = searcher.search(&query, &Count);
});
}
mandatory_queries.push(Box::new(BoostQuery::new(Box::new(query), boost)));
} else {
let query: Box<dyn Query> =
if self.is_remote || !lenient || !tokens.ends_with(&[possible_query]) {
Box::new(TermQuery::new(term, IndexRecordOption::Basic))
} else {
Box::new(FuzzyTermQuery::new_prefix(term, 0, false))
};
if self.is_remote {
let searcher = searcher.clone();
let query = query.box_clone();
spawn_blocking(move || {
let _ = searcher.search(&query, &Count);
});
}
mandatory_queries.push(Box::new(BoostQuery::new(query, boost)));
};
if self.is_remote {
let searcher = searcher.clone();
let query = query.box_clone();
spawn_blocking(move || {
let _ = searcher.search(&query, &Count);
});
}
mandatory_queries.push(Box::new(BoostQuery::new(query, boost)));
}
}

Expand Down Expand Up @@ -328,7 +329,7 @@ impl AirmailIndex {
]));
}

return Box::new(final_query);
Box::new(final_query)
}

/// This is public because I don't want one big mega-crate but its API should not be considered even remotely stable.
Expand All @@ -339,7 +340,7 @@ impl AirmailIndex {
tags: Option<Vec<String>>,
bbox: Option<Rect<f64>>,
boost_regions: &[(f32, Rect<f64>)],
) -> Result<Vec<(AirmailPoi, f32)>, Box<dyn std::error::Error>> {
) -> Result<Vec<(AirmailPoi, f32)>> {
let tantivy_reader = self.tantivy_index.reader()?;
let searcher = tantivy_reader.searcher();
let query_string = query.trim().replace("'s", "s");
Expand All @@ -352,7 +353,7 @@ impl AirmailIndex {
&query_string,
tags,
bbox,
&boost_regions,
boost_regions,
request_leniency,
)
.await;
Expand Down Expand Up @@ -424,11 +425,7 @@ impl AirmailIndexWriter {
doc.add_text(self.schema.get_field(FIELD_CONTENT).unwrap(), value);
}

pub async fn add_poi(
&mut self,
poi: SchemafiedPoi,
source: &str,
) -> Result<(), Box<dyn std::error::Error>> {
pub fn add_poi(&mut self, poi: SchemafiedPoi, source: &str) -> Result<()> {
let mut doc = TantivyDocument::default();
for content in poi.content {
self.process_field(&mut doc, &content);
Expand Down Expand Up @@ -468,7 +465,7 @@ impl AirmailIndexWriter {
Ok(())
}

pub fn commit(mut self) -> Result<(), Box<dyn std::error::Error>> {
pub fn commit(mut self) -> Result<()> {
// Takes `self` by value, so this writer cannot be used again after the call.
// The value returned by the underlying tantivy commit is discarded; only
// failure is reported to the caller.
self.tantivy_writer.commit()?;
Ok(())
}
Expand Down
1 change: 1 addition & 0 deletions airmail/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#[macro_use]
extern crate lazy_static;

pub mod error;
pub mod index;
pub mod poi;
pub mod query;
Expand Down
12 changes: 3 additions & 9 deletions airmail/src/poi.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::error::Error;

use anyhow::Result;
use lingua::Language;
use serde::{Deserialize, Serialize};

Expand All @@ -16,12 +15,7 @@ pub struct AirmailPoi {
}

impl AirmailPoi {
pub fn new(
source: String,
lat: f64,
lng: f64,
tags: Vec<(String, String)>,
) -> Result<Self, Box<dyn Error>> {
pub fn new(source: String, lat: f64, lng: f64, tags: Vec<(String, String)>) -> Result<Self> {
let s2cell = s2::cellid::CellID::from(s2::latlng::LatLng::from_degrees(lat, lng)).0;

Ok(Self {
Expand Down Expand Up @@ -56,7 +50,7 @@ impl ToIndexPoi {
lat: f64,
lng: f64,
tags: Vec<(String, String)>,
) -> Result<Self, Box<dyn Error>> {
) -> Result<Self> {
let s2cell = s2::cellid::CellID::from(s2::latlng::LatLng::from_degrees(lat, lng)).0;

Ok(Self {
Expand Down
2 changes: 1 addition & 1 deletion airmail/src/substitutions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ pub fn permute_road(road: &str, language: &Language) -> Result<Vec<String>, Box<
.split_ascii_whitespace()
.map(|s| s.to_string())
.collect();
apply_subs(&vec![], &road_tokens, sub_dict)
apply_subs(&[], &road_tokens, sub_dict)
}

#[cfg(test)]
Expand Down
18 changes: 0 additions & 18 deletions airmail_import_osm/Cargo.toml

This file was deleted.

Loading

0 comments on commit 1c32229

Please sign in to comment.