Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ellenhp committed Jan 27, 2024
0 parents commit ee2402b
Show file tree
Hide file tree
Showing 32 changed files with 51,738 additions and 0 deletions.
16 changes: 16 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/

# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk

# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

**.geojson
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[workspace]
resolver = "2"
members = [
"airmail",
"airmail_parser",
]
25 changes: 25 additions & 0 deletions LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
The MIT License (MIT)
=====================

Copyright 2024 Ellen Poe

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the “Software”), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Ellen writes a geocoder

Airmail might eventually become a hybrid online/offline geocoder. This is a forever project and a pipe dream; you probably shouldn't use it yet, and I may never finish it.

For now, the parser is reasonably good.
19 changes: 19 additions & 0 deletions airmail/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[package]
name = "airmail"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
clap = { version = "4.4.18", features = ["derive"] }
geojson = "0.24.1"
levenshtein_automata = "0.2.1"
s2 = "0.0.12"
tantivy = "0.21.1"
tantivy-common = "0.6.0"
tantivy-fst = "0.4.0"
tempfile = "3.9.0"

[[bin]]
name = "index"
3 changes: 3 additions & 0 deletions airmail/src/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
### Here be dragons

This crate is a mess and does almost nothing useful.
15 changes: 15 additions & 0 deletions airmail/src/bin/index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
use clap::Parser;

// Command-line arguments for the `index` binary; the parser is derived by clap.
// NOTE(review): written as plain `//` comments on purpose, since clap reuses `///`
// doc comments as help text and adding one to the struct would change `--help`.
#[derive(Parser, Debug)]
struct Args {
/// The GeoJSON file to index.
// Optional (`Option<String>`); exposed with both a short and a long flag.
#[clap(short, long)]
geojson: Option<String>,
/// The directory to output index tiles into.
// Required (`String`); exposed with both a short and a long flag.
#[clap(short, long)]
index_dir: String,
}

/// Entry point of the `index` binary.
///
/// NOTE(review): this is currently a placeholder. It returns `Ok(())` immediately:
/// the `Args` struct declared in this file is never parsed and nothing is indexed.
fn main() -> Result<(), Box<dyn std::error::Error>> {
Ok(())
}
119 changes: 119 additions & 0 deletions airmail/src/index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use tantivy::{
collector::{Count, TopDocs},
schema::{Schema, INDEXED, STORED, TEXT},
};

use crate::{poi::AirmailPoi, query::AirmailQuery};

// Field name keys.
pub const FIELD_NAME: &str = "name";
pub const FIELD_CATEGORY: &str = "category";
pub const FIELD_HOUSE_NUMBER: &str = "house_number";
pub const FIELD_ROAD: &str = "road";
pub const FIELD_UNIT: &str = "unit";
pub const FIELD_LOCALITY: &str = "locality";
pub const FIELD_REGION: &str = "region";
pub const FIELD_S2CELL: &str = "s2cell";

pub struct AirmailIndex {
tantivy_index: tantivy::Index,
}

impl AirmailIndex {
fn schema() -> tantivy::schema::Schema {
let mut schema_builder = Schema::builder();
let _ = schema_builder.add_text_field(FIELD_NAME, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_CATEGORY, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_HOUSE_NUMBER, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_ROAD, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_UNIT, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_LOCALITY, TEXT | STORED);
let _ = schema_builder.add_text_field(FIELD_REGION, TEXT | STORED);
let _ = schema_builder.add_u64_field(FIELD_S2CELL, INDEXED | STORED);
schema_builder.build()
}

pub fn field_name(&self) -> tantivy::schema::Field {
self.tantivy_index.schema().get_field(FIELD_NAME).unwrap()
}

pub fn field_house_number(&self) -> tantivy::schema::Field {
self.tantivy_index
.schema()
.get_field(FIELD_HOUSE_NUMBER)
.unwrap()
}

pub fn field_road(&self) -> tantivy::schema::Field {
self.tantivy_index.schema().get_field(FIELD_ROAD).unwrap()
}

pub fn field_unit(&self) -> tantivy::schema::Field {
self.tantivy_index.schema().get_field(FIELD_UNIT).unwrap()
}

pub fn field_locality(&self) -> tantivy::schema::Field {
self.tantivy_index
.schema()
.get_field(FIELD_LOCALITY)
.unwrap()
}

pub fn field_region(&self) -> tantivy::schema::Field {
self.tantivy_index.schema().get_field(FIELD_REGION).unwrap()
}

pub fn create(index_dir: &str) -> Result<Self, Box<dyn std::error::Error>> {
let schema = Self::schema();
let tantivy_index = tantivy::Index::create_in_dir(index_dir, schema)?;
Ok(Self { tantivy_index })
}

pub fn new(index_dir: &str) -> Result<Self, Box<dyn std::error::Error>> {
let tantivy_index = tantivy::Index::open_in_dir(index_dir)?;
Ok(Self { tantivy_index })
}

pub fn writer(&mut self) -> Result<AirmailIndexWriter, Box<dyn std::error::Error>> {
let tantivy_writer = self.tantivy_index.writer(50_000_000)?;
let writer = AirmailIndexWriter { tantivy_writer };
Ok(writer)
}

pub fn search(&self, query: AirmailQuery) -> Result<Vec<String>, Box<dyn std::error::Error>> {
let tantivy_reader = self.tantivy_index.reader()?;
let searcher = tantivy_reader.searcher();
let results = searcher.search(&query, &(TopDocs::with_limit(5), Count))?;
let strings = results
.0
.iter()
.map(|s| format!("{:?}", searcher.doc(s.1).unwrap()))
.collect();
Ok(strings)
}
}

pub struct AirmailIndexWriter {
tantivy_writer: tantivy::IndexWriter,
}

impl AirmailIndexWriter {
pub fn add_poi(&mut self, poi: AirmailPoi) -> Result<(), Box<dyn std::error::Error>> {
let mut document = tantivy::Document::new();
let schema = self.tantivy_writer.index().schema();
if let Some(name) = poi.name {
document.add_text(schema.get_field(FIELD_NAME)?, name);
}
if let Some(category) = poi.category {
document.add_text(schema.get_field(FIELD_CATEGORY)?, category);
}
document.add_u64(schema.get_field(FIELD_S2CELL)?, poi.s2cell);
self.tantivy_writer.add_document(document)?;
Ok(())
}

pub fn commit(mut self) -> Result<(), Box<dyn std::error::Error>> {
self.tantivy_writer.commit()?;
Ok(())
}
}
4 changes: 4 additions & 0 deletions airmail/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
pub mod index;
pub mod parser;
pub mod poi;
pub mod query;
1 change: 1 addition & 0 deletions airmail/src/parser.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

65 changes: 65 additions & 0 deletions airmail/src/poi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use geojson::{GeoJson, Value};

/// Normalizes an optional OpenAddresses property value.
///
/// The text is lowercased, leading and trailing whitespace is dropped, and every
/// internal run of whitespace becomes a single space. `None` stays `None`; a
/// blank input yields `Some("")`.
fn sanitize_oa_field(field: Option<&str>) -> Option<String> {
    let lowered = field?.to_lowercase();
    let mut cleaned = String::with_capacity(lowered.len());
    for token in lowered.split_whitespace() {
        // Tokens are never empty, so an empty buffer means this is the first one.
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(token);
    }
    Some(cleaned)
}
/// A point of interest or address record ready to be indexed.
///
/// Every textual attribute is optional; the location is always present as an S2
/// cell id.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AirmailPoi {
    /// Name of the place, if it has one.
    pub name: Option<String>,
    /// Category of the place, if known.
    pub category: Option<String>,
    /// House number of the address.
    pub house_number: Option<String>,
    /// Road (street) of the address.
    pub road: Option<String>,
    /// Unit within the address.
    pub unit: Option<String>,
    /// Locality (city) of the address.
    pub locality: Option<String>,
    /// Region of the address.
    pub region: Option<String>,
    /// Raw `u64` value of the S2 cell id for the record's location.
    pub s2cell: u64,
}

impl AirmailPoi {
    /// Builds a POI from a single OpenAddresses GeoJSON feature.
    ///
    /// The `number`, `street`, `unit` and `city` properties are read as strings,
    /// sanitized, and stored as house number, road, unit and locality. Properties
    /// that are missing or not strings become `None`. `name`, `category` and
    /// `region` are always `None`. The point geometry is read as
    /// `[longitude, latitude]` and converted to an S2 cell id.
    ///
    /// # Errors
    ///
    /// Returns an error (rather than panicking) when `object` is not a feature,
    /// the feature has no properties, it has no geometry, the geometry is not a
    /// point, or the point has fewer than two coordinates.
    pub fn from_openaddresses_geojson(
        object: &geojson::GeoJson,
    ) -> Result<Self, Box<dyn std::error::Error>> {
        let feature = match object {
            GeoJson::Feature(feature) => feature,
            _ => return Err("Not a feature".into()),
        };

        let properties = feature
            .properties
            .as_ref()
            .ok_or("Feature has no properties")?;
        // Reads one property as a string and normalizes it.
        let text_property =
            |key: &str| sanitize_oa_field(properties.get(key).and_then(|v| v.as_str()));

        let point = match feature.geometry.as_ref().map(|geometry| &geometry.value) {
            Some(Value::Point(point)) => point,
            Some(_) => return Err("Feature geometry is not a point".into()),
            None => return Err("Feature has no geometry".into()),
        };
        // The point is read as longitude first, then latitude; any further
        // coordinates are ignored.
        let (lng, lat) = match point.as_slice() {
            [lng, lat, ..] => (*lng, *lat),
            _ => return Err("Point geometry has fewer than two coordinates".into()),
        };
        let s2cell: s2::cellid::CellID = s2::latlng::LatLng::from_degrees(lat, lng).into();

        Ok(Self {
            name: None,
            category: None,
            house_number: text_property("number"),
            road: text_property("street"),
            unit: text_property("unit"),
            locality: text_property("city"),
            region: None,
            s2cell: s2cell.0,
        })
    }
}
Loading

0 comments on commit ee2402b

Please sign in to comment.