Skip to content

Commit

Permalink
support custom shortcodes
Browse files Browse the repository at this point in the history
support custom shortcodes
  • Loading branch information
Mindful committed Nov 3, 2024
1 parent e47ab9b commit ce05cfe
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 18 deletions.
12 changes: 9 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
# Hunspell and wordfreq data
en_US.aff
en_US.dic
hunspell*
README_en_US.txt

count_1w.txt


*.m4
*.in
/config.*
Expand All @@ -8,9 +17,6 @@ stamp-*
*.xml
.cmake

hunspell_US.txt
count_1w.txt

CMakeFiles/
*.cbp
cmake_install.cmake
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ wget https://norvig.com/ngrams/count_1w.txt
Finally, run the preprocessing script
```shell
cd src/predict
cargo run --package preproc --bin preproc
# Can also pass just one of "symbols" or "dictionary" to regenerate only that portion
cargo run --package preproc --bin preproc symbols dictionary
```

This will generate `dictionary.fst`, `shortcodes.fst` and `symbols.bin`.
Expand Down
1 change: 1 addition & 0 deletions src/predict/custom_shortcodes.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
lenny ( ͡° ͜ʖ ͡°)
78 changes: 64 additions & 14 deletions src/predict/preproc/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use fst::MapBuilder;
use std::collections::{HashMap, HashSet};
use std::error;
use std::fmt::{Display, Formatter};
use std::fs::File;
use std::io;
use std::io::{BufRead, Write};
use std::num::ParseIntError;
use std::{env, error};

#[derive(Debug, Clone)]
#[allow(dead_code)]
Expand Down Expand Up @@ -65,15 +65,19 @@ fn math_symbol_shortcodes() -> Vec<(String, String)> {
.has_headers(false)
.from_reader(reader);

rdr.into_records()
let output: Vec<(String, String)> = rdr
.into_records()
.filter_map(|result| {
result.ok().map(|record| {
let symbol = &record[2];
(String::from(&record[3]), String::from(symbol)) //shortcode, symbol
})
})
.filter(|(_shortcode, symbol)| whitelist.contains(symbol))
.collect()
.collect();

println!("Found {} math symbols", output.len());
output
}

fn github_emoji_shortcodes() -> Vec<(String, String)> {
Expand All @@ -85,7 +89,8 @@ fn github_emoji_shortcodes() -> Vec<(String, String)> {

//have to filter out bad URLs like
// "https://github.githubassets.com/images/icons/emoji/bowtie.png?v8"
json.iter()
let output: Vec<(String, String)> = json
.iter()
.filter_map(|(key, url)| {
let key_chars: Vec<char> = key.chars().collect();
if key_chars.first().map(|c| c == &'u').unwrap_or(false)
Expand All @@ -101,7 +106,31 @@ fn github_emoji_shortcodes() -> Vec<(String, String)> {
.ok()
}
})
.collect::<Vec<(String, String)>>()
.collect::<Vec<(String, String)>>();
println!("Found {} github emoji shortcodes", output.len());
output
}

/// Loads user-defined shortcodes from `custom_shortcodes.tsv`.
///
/// The file is opened relative to the current working directory and parsed as
/// headerless, tab-separated data. The first column of each record is the
/// shortcode and the second is the symbol it maps to; any further columns are
/// ignored. The number of entries found is printed to stdout.
///
/// # Panics
///
/// Panics if the file cannot be opened, if a record fails to parse, or if a
/// record has fewer than two columns.
fn custom_shortcodes() -> Vec<(String, String)> {
    // Read (shortcode, symbol) pairs from custom_shortcodes.tsv
    let reader = io::BufReader::new(
        File::open("custom_shortcodes.tsv").expect("Failed to open custom_shortcodes.tsv"),
    );
    let mut rdr = csv::ReaderBuilder::new()
        .delimiter(b'\t')
        .has_headers(false)
        .from_reader(reader);

    let output: Vec<(String, String)> = rdr
        .records()
        .enumerate()
        .map(|(index, result)| {
            let record = result.expect("Failed to parse a record in custom_shortcodes.tsv");
            // Checked access: a row missing its symbol column is reported by
            // record number instead of surfacing as an index-out-of-bounds panic.
            match (record.get(0), record.get(1)) {
                (Some(shortcode), Some(symbol)) => (shortcode.to_string(), symbol.to_string()),
                _ => panic!(
                    "Record {} in custom_shortcodes.tsv must have a shortcode and a symbol column",
                    index + 1
                ),
            }
        })
        .collect();

    println!("Found {} custom shortcodes", output.len());
    output
}

fn write_symbols_and_shortcodes(
Expand Down Expand Up @@ -213,18 +242,39 @@ fn process_dictionary() -> Result<(), Box<dyn error::Error>> {
}

/// Entry point for the `preproc` binary.
///
/// NOTE(review): this listing appears to interleave lines from before and after
/// the commit (for example both `math_symbols` and `math_shortcodes` are bound,
/// and `all_symbols` is bound twice) — confirm against the real file before
/// relying on the statement order shown here.
fn main() -> Result<(), Box<dyn error::Error>> {
println!("Fetching math symbols");
let math_symbols = math_symbol_shortcodes();
// Collect the command-line arguments into a set so each stage name can be
// looked up with `contains`.
let args: HashSet<String> = env::args().collect();

// Stage selected by the "symbols" argument.
if args.contains("symbols") {
println!("-- Processing symbols and shortcodes --");
println!("Fetching math symbols");
let math_shortcodes = math_symbol_shortcodes();

println!("Fetching shortcodes from github");
let shortcodes = github_emoji_shortcodes();
println!("Fetching shortcodes from github");
let github_shortcodes = github_emoji_shortcodes();

let all_symbols = [math_symbols, shortcodes].concat();
let custom_shortcodes = custom_shortcodes();
// Merge the three (shortcode, symbol) lists into one.
let all_symbols = [math_shortcodes, github_shortcodes, custom_shortcodes].concat();

println!("Writing symbols and shortcodes to files");
write_symbols_and_shortcodes(all_symbols)?;
println!("Processing dictionary");
process_dictionary()?;
// Deduplicate the shortcode column; the set only keeps distinct shortcodes.
let shortcode_set = all_symbols
.iter()
.map(|(shortcode, _)| shortcode)
.collect::<HashSet<&String>>();

// Fewer distinct shortcodes than entries means at least two entries share a
// shortcode, so abort with an error instead of writing output.
if shortcode_set.len() != all_symbols.len() {
return Err("Shortcode collision detected".into());
} else {
println!("No shortcode collisions detected");
}

println!("Writing symbols and shortcodes to files");
write_symbols_and_shortcodes(all_symbols)?;
println!("-- Done processing symbols and shortcodes --");
}
// Stage selected by the "dictionary" argument.
if args.contains("dictionary") {
println!("-- Processing dictionary --");
process_dictionary()?;
println!("-- Done processing dictionary --");
}

Ok(())
}
Binary file modified src/predict/shortcodes.fst
Binary file not shown.
Binary file modified src/predict/symbols.bin
Binary file not shown.

0 comments on commit ce05cfe

Please sign in to comment.