diff --git a/stm_graphql/Cargo.toml b/stm_graphql/Cargo.toml
index 29c6951..50752ed 100644
--- a/stm_graphql/Cargo.toml
+++ b/stm_graphql/Cargo.toml
@@ -13,6 +13,7 @@ tracing-subscriber = "0.3"
 log = "0.4"
 lambda_runtime = { git = "https://github.com/awslabs/aws-lambda-rust-runtime.git" }
 lambda-debug-proxy-client = {git = "https://github.com/rimutaka/lambda-debug-proxy.git"}
+simple-error = "0.2"
 hyper = { version = "0.14", features = ["http2"] }
 hyper-rustls = "0.23"
 regex = "1.5"
diff --git a/stm_graphql/src/config.rs b/stm_graphql/src/config.rs
index 5788391..21db5d0 100644
--- a/stm_graphql/src/config.rs
+++ b/stm_graphql/src/config.rs
@@ -63,13 +63,13 @@ const SEARCH_TERM_REGEX: &str = r#"[#:\-._+0-9a-zA-Z]+"#;
 const NO_SQL_STRING_INVALIDATION_REGEX: &str = r#"[^#\-._+0-9a-zA-Z]"#;
 
 impl Config {
-    // /// The maximum number of dev listings per page of search results
-    // pub const MAX_DEV_LISTINGS_PER_SEARCH_RESULT: usize = 50;
+    /// The maximum number of dev listings per page of search results
+    pub const MAX_DEV_LISTINGS_PER_SEARCH_RESULT: usize = 50;
 
-    // /// The maximum number of pages allowed in search.
-    // /// Check HTML templates if changing the limits on page numbers
-    // /// 20 is hardcoded in some of the logic there
-    // pub const MAX_PAGES_PER_SEARCH_RESULT: usize = 20;
+    /// The maximum number of pages allowed in search.
+    /// Check HTML templates if changing the limits on page numbers
+    /// 20 is hardcoded in some of the logic there
+    pub const MAX_PAGES_PER_SEARCH_RESULT: usize = 20;
 
     pub fn new() -> Self {
         let aws_region = AwsRegion::from_str(
diff --git a/stm_graphql/src/elastic.rs b/stm_graphql/src/elastic.rs
new file mode 100644
index 0000000..6125e4b
--- /dev/null
+++ b/stm_graphql/src/elastic.rs
@@ -0,0 +1,382 @@
+//use elasticsearch::{http::transport::Transport, CountParts, Elasticsearch, SearchParts};
+use crate::config::Config;
+use futures::future::{join, join_all};
+use regex::Regex;
+use serde_json::Value;
+use std::collections::HashMap;
+use stm_shared::elastic::types as es_types;
+use stm_shared::elastic::{call_es_api, search};
+use tracing::error;
+
+pub const SEARCH_ENGINEER_BY_LOGIN: &str = r#"{"query":{"term":{"login.keyword":{"value":"%"}}}}"#;
+pub const SEARCH_DEV_BY_DOC_ID: &str = r#"{"query":{"term":{"_id":"%"}}}"#;
+pub const SEARCH_ALL_LANGUAGES: &str =
+    r#"{"size":0,"aggs":{"agg":{"terms":{"field":"report.tech.language.keyword","size":1000}}}}"#;
+
+/// Inserts a single param in the ES query in place of %. The param may be repeated within the query multiple times.
+/// Panics if the param is unsafe for no-sql queries.
+pub(crate) fn add_param(query: &str, param: String, no_sql_string_invalidation_regex: &Regex) -> String {
+    // validate the param
+    if no_sql_string_invalidation_regex.is_match(&param) {
+        panic!("Unsafe param value: {}", param);
+    }
+
+    let mut modded_query = query.to_string();
+
+    // loop through the query until there are no more % to replace
+    while modded_query.contains("%") {
+        let (left, right) = modded_query.split_at(modded_query.find("%").expect("Cannot split the query"));
+
+        modded_query = [left, param.as_str(), &right[1..]].concat().to_string();
+    }
+
+    modded_query
+}
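+
+// A minimal sanity check sketch for `add_param`; the regex literal mirrors
+// NO_SQL_STRING_INVALIDATION_REGEX in config.rs and the login value is illustrative.
+#[cfg(test)]
+mod add_param_tests {
+    use super::*;
+
+    #[test]
+    fn replaces_every_placeholder() {
+        let invalidation_regex = Regex::new(r#"[^#\-._+0-9a-zA-Z]"#).expect("invalid regex");
+        assert_eq!(
+            add_param(SEARCH_ENGINEER_BY_LOGIN, "rimutaka".to_string(), &invalidation_regex),
+            r#"{"query":{"term":{"login.keyword":{"value":"rimutaka"}}}}"#
+        );
+    }
+}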
+
+/// Returns the number of ES docs that match the query. The field name is not validated or sanitized.
+/// Returns an error if the field value contains anything other than alphanumerics and `.-_`.
+pub(crate) async fn matching_doc_count(
+    es_url: &String,
+    idx: &String,
+    field: &str,
+    field_value: &String,
+    no_sql_string_invalidation_regex: &Regex,
+) -> Result<usize, ()> {
+    // validate field_value for possible no-sql injection
+    if no_sql_string_invalidation_regex.is_match(field_value) {
+        error!("Invalid field_value: {}", field_value);
+        return Err(());
+    }
+
+    // the query must be built inside this fn to get a consistent response
+    let query = [
+        r#"{"query":{"match":{""#,
+        field,
+        r#"":""#,
+        field_value,
+        r#""}},"size":0}"#,
+    ]
+    .concat();
+
+    let es_api_endpoint = [
+        es_url.as_ref(),
+        "/",
+        idx,
+        "/_search",
+    ]
+    .concat();
+    let count = call_es_api(es_api_endpoint, Some(query.to_string())).await?;
+
+    // extract the actual value from a struct like this
+    // {
+    //   "took" : 652,
+    //   "timed_out" : false,
+    //   "_shards" : {
+    //     "total" : 5,
+    //     "successful" : 5,
+    //     "skipped" : 0,
+    //     "failed" : 0
+    //   },
+    //   "hits" : {
+    //     "total" : {
+    //       "value" : 0,
+    //       "relation" : "eq"
+    //     },
+    //     "max_score" : null,
+    //     "hits" : [ ]
+    //   }
+    // }
+    let count = match serde_json::from_value::<es_types::ESHitsCount>(count) {
+        Ok(v) => v.hits.total.value,
+        Err(e) => {
+            error!(
+                "Failed to deserialize doc count response for idx:{}, field: {}, value: {} with {}",
+                idx, field, field_value, e
+            );
+            return Err(());
+        }
+    };
+
+    Ok(count)
+}
+
+/// Executes multiple doc count queries in parallel and returns the results in the same order.
+/// Returns an error if any of the queries fail.
+pub(crate) async fn matching_doc_counts(
+    es_url: &String,
+    idx: &String,
+    fields: Vec<&str>,
+    field_value: &String,
+    no_sql_string_invalidation_regex: &Regex,
+) -> Result<Vec<usize>, ()> {
+    let mut futures: Vec<_> = Vec::new();
+
+    for field in fields {
+        futures.push(matching_doc_count(es_url, idx, field, field_value, no_sql_string_invalidation_regex));
+    }
+
+    // execute all searches in parallel and unwrap the results
+    let mut counts: Vec<usize> = Vec::new();
+    for count in join_all(futures).await {
+        match count {
+            Err(_) => {
+                return Err(());
+            }
+            Ok(v) => {
+                counts.push(v);
+            }
+        }
+    }
+
+    Ok(counts)
+}
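+
+// Illustrative usage sketch (the field name is the one used by the DEV index elsewhere in this module):
+// `matching_doc_count(&es_url, &idx, "report.tech.language.keyword", &"rust".to_string(), &regex)`
+// composes {"query":{"match":{"report.tech.language.keyword":"rust"}},"size":0}
+// and reads the count back from hits.total.value.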
("rust",1000) +/// * timezone_offset: 0..23 where anything > 12 is the negative offset +/// * timezone_hours: number of hours worked in the timezone +/// * results_from: a pagination value to be passed onto ES +pub(crate) async fn matching_devs( + es_url: &String, + dev_idx: &String, + keywords: Vec, + langs: Vec<(String, usize)>, + timezone_offset: usize, + timezone_hours: usize, + results_from: usize, + no_sql_string_invalidation_regex: &Regex, +) -> Result { + // sample query + // {"size":100,"track_scores":true,"query":{"bool":{"must":[{"match":{"report.tech.language.keyword":"rust"}},{"multi_match":{"query":"logger","fields":["report.tech.pkgs_kw.k.keyword","report.tech.refs_kw.k.keyword"]}},{"multi_match":{"query":"clap","fields":["report.tech.pkgs_kw.k.keyword","report.tech.refs_kw.k.keyword"]}},{"multi_match":{"query":"serde","fields":["report.tech.pkgs_kw.k.keyword","report.tech.refs_kw.k.keyword"]}}]}},"sort":[{"hireable":{"order":"desc"}},{"report.timestamp":{"order":"desc"}}]} + + // a collector of must clauses + let mut must_clauses: Vec = Vec::new(); + + // build language clause + for lang in langs { + // validate field_value for possible no-sql injection + if no_sql_string_invalidation_regex.is_match(&lang.0) { + error!("Invalid lang: {}", lang.0); + return Err(()); + } + + // language clause is different from keywords clause + let clause = if lang.1 == 0 { + // a simple clause with no line counts + [r#"{"match":{"report.tech.language.keyword":""#, &lang.0, r#""}}"#].concat() + } else { + // LoC counts included in the query + [ + r#"{ + "nested": { + "path": "report.tech", + "query": { + "bool": { + "must": [ + { + "match": { + "report.tech.language.keyword": ""#, + &lang.0, + r#"" + } + }, + { + "range": { + "report.tech.code_lines": { + "gt": "#, + &lang.1.to_string(), + r#" + } + } + } + ] + } + } + } + }"#, + ] + .concat() + .replace(" ", "") + }; + + must_clauses.push(clause); + } + + // build keywords clauses + for keyword in keywords { + // validate field_value for possible no-sql injection + if no_sql_string_invalidation_regex.is_match(&keyword) { + error!("Invalid keyword: {}", keyword); + return Err(()); + } + + // query pkgs and refs if the name is qualified or pkgs_kw and refs_kw if it's not + let qual_unqual_clause = if keyword.contains(".") { + r#"","fields":["report.tech.pkgs.k.keyword","report.tech.refs.k.keyword"]}}"# + } else { + r#"","fields":["report.keywords.keyword"]}}"# + }; + + // using multimatch because different techs have keywords in different places + let clause = [r#"{"multi_match":{"query":""#, &keyword, qual_unqual_clause].concat(); + + must_clauses.push(clause); + } + + // add timezone part + if timezone_hours > 0 && timezone_hours <= 24 { + let timezone_offset = if timezone_offset > 9 { + ["h", &timezone_offset.to_string()].concat() + } else { + ["h0", &timezone_offset.to_string()].concat() + }; + + let clause = [ + r#"{"range":{"report.commit_time_histo.timezone_overlap_recent."#, + &timezone_offset, + r#"": {"gte": "#, + &timezone_hours.to_string(), + "}}}", + ] + .concat(); + + error!("TZ clause: {}", clause); + + must_clauses.push(clause); + } + + // combine the clauses + let clauses = must_clauses.join(","); + + // combine everything into a single query + let query = [ + r#"{"size":"#, + &Config::MAX_DEV_LISTINGS_PER_SEARCH_RESULT.to_string(), + r#","from": "#, + &results_from.to_string(), + r#","track_scores":true,"query":{"bool":{"must":["#, + &clauses, + 
r#"]}},"sort":[{"report.last_contributor_commit_date_epoch":{"order":"desc"}}]}"#, + ] + .concat(); + + // call the query + let es_api_endpoint = [es_url.as_ref(), "/", dev_idx, "/_search"].concat(); + let es_response = call_es_api(es_api_endpoint, Some(query.to_string())).await?; + + Ok(es_response) +} + +/// Search related keywords and packages by a partial keyword, up to 100 of each. +/// Returns a combined list of keyword/populary count for refs_kw and pkgs_kw sorted alphabetically. +/// The keyword is checked for validity ([^\-_0-9a-zA-Z]) before inserting into the regex query. +/// Returns an error if the keyword has any extra characters or the queries fail. +pub(crate) async fn related_keywords( + es_url: &String, + idx: &String, + keyword: &String, + regex_substring_invalidation: &Regex, +) -> Result, ()> { + // validate field_value for possible no-sql injection + if regex_substring_invalidation.is_match(&keyword) { + error!("Invalid keyword: {}", keyword); + return Err(()); + } + + // some keywords may contain #,. or -, which should be escaped in regex + // ES regex search is case sensitive, but the data is all in lower-case + // it is faster to make the KW lower case as well + let keyword_escaped = keyword + .to_lowercase() + .replace("#", r#"\\#"#) + .replace("#", r#"\\+"#) + .replace(".", r#"\\."#) + .replace("-", r#"\\-"#); + + // send a joined query to ES + let refs = r#"{"size":0,"aggregations":{"agg":{"terms":{"field":"report.tech.refs.k.keyword","size":50,"include":"(.*\\.)?%.*"}}}}"#; + let refs = refs.replace("%", &keyword_escaped); + let pkgs = r#"{"size":0,"aggregations":{"agg":{"terms":{"field":"report.tech.pkgs.k.keyword","size":50,"include":"(.*\\.)?%.*"}}}}"#; + let pkgs = pkgs.replace("%", &keyword_escaped); + + let (refs, pkgs) = join(search(es_url, idx, Some(&refs)), search(es_url, idx, Some(&pkgs))).await; + + // extract the data from JSON + let refs = match serde_json::from_value::(refs?) { + Err(e) => { + error!("Cannot deser refs with {}", e); + return Err(()); + } + Ok(v) => v, + }; + let pkgs = match serde_json::from_value::(pkgs?) { + Err(e) => { + error!("Cannot pkgs refs with {}", e); + return Err(()); + } + Ok(v) => v, + }; + + // extract refs into a hashmap + let mut related = refs + .aggregations + .agg + .buckets + .into_iter() + .map(|v| (v.key.to_lowercase(), v.doc_count)) + .collect::>(); + + // combine the refs counts with pkgs counts + for bucket in pkgs.aggregations.agg.buckets { + if let Some(doc_count) = related.get_mut(&bucket.key) { + *doc_count += bucket.doc_count; + } else { + related.insert(bucket.key, bucket.doc_count); + } + } + + // convert the combined hashmap into an array + let mut related = related + .into_iter() + .map(|v| (v.0, v.1)) + .collect::>(); + + // sort the result alphabetically + related.sort_by(|a, b| b.1.cmp(&a.1)); + + Ok(related) +} + +/// Reads the latest N entries from the specified stats index, e.g. stm_stats_dev_job_counts. +/// Returns the entire response as JSON Value. The index must follow a certain pattern +/// with the top element the same as the name of the query. Any other format will fail +/// at Tera transform. +/// ```json +/// { +/// "stm_stats_dev_job_counts" : { +/// "iso" : "2021-04-29T10:32:17.660423+00:00", +/// "ts" : 1619692338, +/// ... +/// } +/// } +/// ``` +/// The name of the IDX is included as a field in the query, but is NOT SANITIZED. +pub(crate) async fn get_stm_stats(es_url: &String, idx: &str, count: usize) -> Result { + // e.g. 
+
+/// Reads the latest N entries from the specified stats index, e.g. stm_stats_dev_job_counts.
+/// Returns the entire response as a JSON Value. The index must follow a certain pattern
+/// with the top element the same as the name of the query. Any other format will fail
+/// at the Tera transform.
+/// ```json
+/// {
+///   "stm_stats_dev_job_counts" : {
+///     "iso" : "2021-04-29T10:32:17.660423+00:00",
+///     "ts" : 1619692338,
+///     ...
+///   }
+/// }
+/// ```
+/// The name of the IDX is included as a field in the query, but is NOT SANITIZED.
+pub(crate) async fn get_stm_stats(es_url: &String, idx: &str, count: usize) -> Result<Value, ()> {
+    // e.g. GET stm_stats_dev_job_counts/_search
+    let es_api_endpoint = [es_url.as_ref(), "/", idx, "/_search"].concat();
+
+    // insert the index name in the query
+    let query = [
+        r#"{"size":"#,
+        count.to_string().as_str(),
+        r#","query":{"match_all":{}},"sort":[{""#,
+        idx,
+        r#".ts":{"order":"desc"}}]}"#,
+    ]
+    .concat();
+
+    let es_response = call_es_api(es_api_endpoint, Some(query)).await?;
+
+    Ok(es_response)
+}
diff --git a/stm_graphql/src/handler.rs b/stm_graphql/src/handler.rs
index 0f714d3..a3f573a 100644
--- a/stm_graphql/src/handler.rs
+++ b/stm_graphql/src/handler.rs
@@ -1,9 +1,11 @@
 use crate::authorizer::validate_jwt;
+use crate::handlers;
 use crate::types::{ApiGatewayRequest, ApiGatewayResponse};
 use crate::Error;
 use lambda_runtime::LambdaEvent;
 use serde::Serialize;
 use serde_json::Value;
+use simple_error::SimpleError;
 use std::collections::HashMap;
 use sysinfo::{RefreshKind, System, SystemExt};
 use tracing::info;
@@ -75,13 +77,16 @@ pub(crate) async fn my_handler(event: LambdaEvent<Value>) -> Result<ApiGatewayResponse, Error> {
+    let gql_data = match handlers::home::language_stats(&config).await {
+        Ok(v) => v,
+        Err(_) => {
+            return Err(Box::new(SimpleError::new("Failed to get GQL data")));
+        }
+    };
 
     info!("gql full: {} bytes", gql_data.len());
 
     let gql_data = minify::html::minify(&gql_data);
diff --git a/stm_graphql/src/handlers/home.rs b/stm_graphql/src/handlers/home.rs
new file mode 100644
index 0000000..9ab647e
--- /dev/null
+++ b/stm_graphql/src/handlers/home.rs
@@ -0,0 +1,19 @@
+use crate::config::Config;
+use crate::elastic;
+use stm_shared::elastic as elastic_shared;
+use tracing::info;
+use serde_json::Value;
+
+/// Returns the data for the default home page
+pub(crate) async fn language_stats(config: &Config) -> Result<String, ()> {
+    info!("Generating html-home");
+
+    // get number of devs per technology
+    // let stack_stats =
+    //     elastic_shared::search(&config.es_url, &config.dev_idx, Some(elastic::SEARCH_ALL_LANGUAGES)).await?;
+
+    //Ok(stack_stats)
+    Ok(String::new())
+}
diff --git a/stm_graphql/src/handlers/mod.rs b/stm_graphql/src/handlers/mod.rs
new file mode 100644
index 0000000..0865470
--- /dev/null
+++ b/stm_graphql/src/handlers/mod.rs
@@ -0,0 +1 @@
+pub(crate) mod home;
\ No newline at end of file
diff --git a/stm_graphql/src/main.rs b/stm_graphql/src/main.rs
index a71b5f5..53756e4 100644
--- a/stm_graphql/src/main.rs
+++ b/stm_graphql/src/main.rs
@@ -7,6 +7,8 @@ mod config;
 mod handler;
 mod http_options;
 mod types;
+mod elastic;
+mod handlers;
 
 #[tokio::main]
 async fn main() -> Result<(), Error> {
diff --git a/stm_html_ui/src/html/dev_profile.rs b/stm_html_ui/src/html/dev_profile.rs
index 71d2385..bea1c62 100644
--- a/stm_html_ui/src/html/dev_profile.rs
+++ b/stm_html_ui/src/html/dev_profile.rs
@@ -1,6 +1,7 @@
 use super::html_data::HtmlData;
 use crate::config::Config;
 use crate::elastic;
+use serde_json::Value;
 use stm_shared::elastic as elastic_shared;
 use tracing::info;
 
@@ -11,7 +12,7 @@ pub(crate) async fn html(config: &Config, owner_id: String, html_data: HtmlData)
     let query =
         elastic::add_param(elastic::SEARCH_DEV_BY_DOC_ID, owner_id.clone(), &config.no_sql_string_invalidation_regex);
 
-    let devs = elastic_shared::search(&config.es_url, &config.dev_idx, Some(&query)).await?;
+    let devs = elastic_shared::search::<Value>(&config.es_url, &config.dev_idx, Some(&query)).await?;
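+    // `::<Value>` keeps the raw JSON passthrough now that `search` is generic;
+    // any struct implementing `DeserializeOwned` could be requested here instead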
 
     // default response code
     let mut http_resp_code = 404_u32;
diff --git a/stm_html_ui/src/html/gh_login_profile.rs b/stm_html_ui/src/html/gh_login_profile.rs
index 6255421..8293ac4 100644
--- a/stm_html_ui/src/html/gh_login_profile.rs
+++ b/stm_html_ui/src/html/gh_login_profile.rs
@@ -1,6 +1,7 @@
 use super::html_data::HtmlData;
 use crate::config::Config;
 use crate::elastic;
+use serde_json::Value;
 use stm_shared::elastic as elastic_shared;
 use tracing::info;
 
@@ -10,7 +11,7 @@ pub(crate) async fn html(config: &Config, login: String, html_data: HtmlData) ->
     let query =
         elastic::add_param(elastic::SEARCH_ENGINEER_BY_LOGIN, login.clone(), &config.no_sql_string_invalidation_regex);
 
-    let devs = elastic_shared::search(&config.es_url, &config.dev_idx, Some(query.as_str())).await?;
+    let devs = elastic_shared::search::<Value>(&config.es_url, &config.dev_idx, Some(query.as_str())).await?;
 
     // default response code
     let mut http_resp_code = 404_u32;
diff --git a/stm_shared/src/elastic/mod.rs b/stm_shared/src/elastic/mod.rs
index d6d18ea..f0cf4c4 100644
--- a/stm_shared/src/elastic/mod.rs
+++ b/stm_shared/src/elastic/mod.rs
@@ -2,11 +2,15 @@
 use hyper::{header::HeaderValue, Body, Client, Request, Uri};
 use hyper_rustls::HttpsConnectorBuilder;
 use regex::Regex;
-use serde::Serialize;
+use serde::{de::DeserializeOwned, Serialize};
 use serde_json::Value;
 use tracing::{debug, error, info};
 
 pub mod types;
+pub mod types_aggregations;
+pub mod types_search_log;
+pub mod types_source;
+pub mod types_hits;
 
 /// A generic function for making signed(v4) API calls to AWS ES.
 /// `es_api_endpoint` must be a fully qualified URL, e.g. https://x.ap-southeast-2.es.amazonaws.com/my_index/_search
@@ -115,7 +119,7 @@ pub async fn upload_object_to_es(
 /// A generic function for making signed(v4) API calls to AWS ES.
 /// `es_api_endpoint` must be a fully qualified URL, e.g. https://x.ap-southeast-2.es.amazonaws.com/my_index/_search
-pub async fn call_es_api(es_api_endpoint: String, payload: Option<String>) -> Result<Value, ()> {
+pub async fn call_es_api<T: DeserializeOwned>(es_api_endpoint: String, payload: Option<String>) -> Result<T, ()> {
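+    // `T` is the deserialization target; callers that want the previous raw-JSON
+    // behaviour pass `serde_json::Value`, e.g. `call_es_api::<Value>(endpoint, None)`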
     // prepare METHOD and the payload in one step
     let (method, payload_id, payload) = match payload {
         None => ("GET", 0usize, Body::empty()),
@@ -177,7 +181,7 @@
     }
 
     // all responses should be JSON. If it's not JSON it's an error.
-    let output = Ok(serde_json::from_slice::<Value>(&buf).expect("Failed to convert ES resp to JSON"));
+    let output = Ok(serde_json::from_slice::<T>(&buf).expect("Failed to convert ES resp to a type"));
 
     info!("ES query {} deserialized", payload_id);
     //info!("{}", output.as_ref().unwrap()); // for debugging output
@@ -221,12 +225,12 @@ pub async fn get_doc_by_id(
 /// * es_url: elasticsearch url
 /// * idx: ES index name
 /// * query: the query text, if any for *_search* or `None` for *_count*
-pub async fn search(es_url: &String, idx: &String, query: Option<&str>) -> Result<Value, ()> {
+pub async fn search<T: DeserializeOwned>(es_url: &String, idx: &String, query: Option<&str>) -> Result<T, ()> {
     if query.is_some() {
         let es_api_endpoint = [es_url.as_ref(), "/", idx, "/_search"].concat();
-        return call_es_api(es_api_endpoint, Some(query.unwrap().to_string())).await;
+        return call_es_api::<T>(es_api_endpoint, Some(query.unwrap().to_string())).await;
     } else {
         let es_api_endpoint = [es_url.as_ref(), "/", idx, "/_count"].concat();
-        return call_es_api(es_api_endpoint, None).await;
+        return call_es_api::<T>(es_api_endpoint, None).await;
     }
 }
diff --git a/stm_shared/src/elastic/types.rs b/stm_shared/src/elastic/types.rs
index bde4b4b..b51c0c8 100644
--- a/stm_shared/src/elastic/types.rs
+++ b/stm_shared/src/elastic/types.rs
@@ -1,21 +1,9 @@
-use serde::{Deserialize, Serialize};
-use std::collections::hash_map::DefaultHasher;
-use std::hash::{Hash, Hasher};
+use super::types_hits::ESHitsCountHits;
+use super::types_source::ESHits;
+use serde::Deserialize;
 
-// HITS WRAPPER **************************************************************************************
-
-/// An inner member
-#[derive(Deserialize, Debug)]
-pub struct ESSourceSource<T> {
-    #[serde(rename(deserialize = "_source"))]
-    pub source: T,
-}
-
-/// An inner member
-#[derive(Deserialize, Debug)]
-pub struct ESHits<T> {
-    pub hits: Vec<ESSourceSource<T>>,
-}
+pub use super::types_aggregations::ESAggs;
+pub use super::types_search_log::SearchLog;
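+// the re-exports above keep existing `elastic::types::{ESAggs, SearchLog}` import paths compiling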
 
 /// A generic wrapper to get to any type of _source in ES response. E.g.
 /// ```json
@@ -58,18 +46,6 @@ pub struct ESReportTimestamp {
     pub report: ESReportTimestampTimestamp,
 }
 
-/// Member of ESHitsCount
-#[derive(Deserialize)]
-pub struct ESHitsCountTotals {
-    pub value: usize,
-}
-
-/// Member of ESHitsCount
-#[derive(Deserialize)]
-pub struct ESHitsCountHits {
-    pub total: ESHitsCountTotals,
-}
-
 /// Corresponds to ES response metadata
 /// ```json
 /// {
@@ -96,45 +72,6 @@ pub struct ESHitsCount {
     pub hits: ESHitsCountHits,
 }
 
-/// Part of ESAggs
-#[derive(Deserialize)]
-pub struct ESAggsBucket {
-    pub key: String,
-    pub doc_count: usize,
-}
-
-/// Part of ESAggs
-#[derive(Deserialize)]
-pub struct ESAggsBuckets {
-    pub buckets: Vec<ESAggsBucket>,
-}
-
-/// Part of ESAggs
-#[derive(Deserialize)]
-pub struct ESAggsAgg {
-    pub agg: ESAggsBuckets,
-}
-
-/// A generic structure for ES aggregations result. Make sure the aggregation name is `aggs`.
-/// ```json
-/// {
-///    "aggregations" : {
-///      "agg" : {
-///        "buckets" : [
-///          {
-///            "key" : "twilio",
-///            "doc_count" : 597
-///          }
-///        ]
-///      }
-///    }
-/// }
-/// ```
-#[derive(Deserialize)]
-pub struct ESAggs {
-    pub aggregations: ESAggsAgg,
-}
-
 /// Top level contents of _source
 /// To be used as `T` for ESHits.
 /// ```json
@@ -171,49 +108,3 @@ pub struct ESSourceDev {
     /// A free-text location from GH
     pub location: Option<String>,
 }
-
-// ---------------------------------------------------------------------
-/// A container for a search results log entry
-#[derive(Serialize, Deserialize)]
-pub struct SearchLog {
-    /// The raw search string as entered by the user
-    pub raw: String,
-    /// Same as availability_tz in html_data
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub availability_tz: Option<String>,
-    /// Same as availability_tz_hrs in html_data
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub availability_tz_hrs: Option<usize>,
-    /// List of keywords extracted from the raw search
-    pub kw: Vec<String>,
-    /// A list of search terms matching known languages
-    pub lang: Vec<String>,
-    /// Page number of the request, defaults to 1
-    #[serde(default)]
-    pub page_num: usize,
-    /// Source IP address
-    pub ip: Option<String>,
-    /// EPOCH of the timestamp
-    pub ts: i64,
-    /// Duration of the request in ms
-    pub dur: i64,
-    /// List of GH logins found in the response
-    pub gh_logins: Vec<String>,
-}
-
-impl Hash for SearchLog {
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        self.raw.hash(state);
-        self.ip.hash(state);
-        self.ts.hash(state);
-    }
-}
-
-impl SearchLog {
-    /// Returns a hash of the object as u64 converted to string
-    pub fn get_hash(&self) -> String {
-        let mut hasher = DefaultHasher::new();
-        self.hash(&mut hasher);
-        hasher.finish().to_string()
-    }
-}
diff --git a/stm_shared/src/elastic/types_aggregations.rs b/stm_shared/src/elastic/types_aggregations.rs
new file mode 100644
index 0000000..3c39269
--- /dev/null
+++ b/stm_shared/src/elastic/types_aggregations.rs
@@ -0,0 +1,40 @@
+use serde::Deserialize;
+
+/// A generic structure for ES aggregations result. Make sure the aggregation name is `agg`.
+/// ```json
+/// {
+///    "aggregations" : {
+///      "agg" : {
+///        "buckets" : [
+///          {
+///            "key" : "twilio",
+///            "doc_count" : 597
+///          }
+///        ]
+///      }
+///    }
+/// }
+/// ```
+#[derive(Deserialize)]
+pub struct ESAggs {
+    pub aggregations: ESAggsAgg,
+}
+
+/// Part of ESAggs
+#[derive(Deserialize)]
+pub struct ESAggsBucket {
+    pub key: String,
+    pub doc_count: usize,
+}
+
+/// Part of ESAggs
+#[derive(Deserialize)]
+pub struct ESAggsBuckets {
+    pub buckets: Vec<ESAggsBucket>,
+}
+
+/// Part of ESAggs
+#[derive(Deserialize)]
+pub struct ESAggsAgg {
+    pub agg: ESAggsBuckets,
+}
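+
+// A minimal deserialization sketch for ESAggs using the JSON shape documented above.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn deserializes_bucket_counts() {
+        let es_response = serde_json::json!({
+            "aggregations": { "agg": { "buckets": [ { "key": "twilio", "doc_count": 597 } ] } }
+        });
+        let aggs: ESAggs = serde_json::from_value(es_response).expect("failed to deser ESAggs");
+        assert_eq!(aggs.aggregations.agg.buckets[0].doc_count, 597);
+    }
+}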
diff --git a/stm_shared/src/elastic/types_hits.rs b/stm_shared/src/elastic/types_hits.rs
new file mode 100644
index 0000000..33d3ab3
--- /dev/null
+++ b/stm_shared/src/elastic/types_hits.rs
@@ -0,0 +1,13 @@
+use serde::Deserialize;
+
+/// Member of ESHitsCount
+#[derive(Deserialize)]
+pub struct ESHitsCountTotals {
+    pub value: usize,
+}
+
+/// Member of ESHitsCount
+#[derive(Deserialize)]
+pub struct ESHitsCountHits {
+    pub total: ESHitsCountTotals,
+}
diff --git a/stm_shared/src/elastic/types_search_log.rs b/stm_shared/src/elastic/types_search_log.rs
new file mode 100644
index 0000000..fb0fa33
--- /dev/null
+++ b/stm_shared/src/elastic/types_search_log.rs
@@ -0,0 +1,48 @@
+use serde::{Deserialize, Serialize};
+use std::collections::hash_map::DefaultHasher;
+use std::hash::{Hash, Hasher};
+
+/// A container for a search results log entry
+#[derive(Serialize, Deserialize)]
+pub struct SearchLog {
+    /// The raw search string as entered by the user
+    pub raw: String,
+    /// Same as availability_tz in html_data
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub availability_tz: Option<String>,
+    /// Same as availability_tz_hrs in html_data
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub availability_tz_hrs: Option<usize>,
+    /// List of keywords extracted from the raw search
+    pub kw: Vec<String>,
+    /// A list of search terms matching known languages
+    pub lang: Vec<String>,
+    /// Page number of the request, defaults to 1
+    #[serde(default)]
+    pub page_num: usize,
+    /// Source IP address
+    pub ip: Option<String>,
+    /// EPOCH of the timestamp
+    pub ts: i64,
+    /// Duration of the request in ms
+    pub dur: i64,
+    /// List of GH logins found in the response
+    pub gh_logins: Vec<String>,
+}
+
+impl Hash for SearchLog {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.raw.hash(state);
+        self.ip.hash(state);
+        self.ts.hash(state);
+    }
+}
+
+impl SearchLog {
+    /// Returns a hash of the object as u64 converted to string
+    pub fn get_hash(&self) -> String {
+        let mut hasher = DefaultHasher::new();
+        self.hash(&mut hasher);
+        hasher.finish().to_string()
+    }
+}
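+
+// A sketch showing the hash covers only `raw`, `ip` and `ts`; the field values are illustrative.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn hash_ignores_non_key_fields() {
+        let mut log = SearchLog {
+            raw: "rust serde".to_string(),
+            availability_tz: None,
+            availability_tz_hrs: None,
+            kw: Vec::new(),
+            lang: Vec::new(),
+            page_num: 1,
+            ip: Some("127.0.0.1".to_string()),
+            ts: 1619692338,
+            dur: 0,
+            gh_logins: Vec::new(),
+        };
+        let hash_before = log.get_hash();
+        log.kw.push("rust".to_string());
+        assert_eq!(hash_before, log.get_hash());
+    }
+}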
"total" : { + // "value" : 1, + // "relation" : "eq" + // }, +} + +// MISC REPORT FIELDS ************************************************************************** + +/// An inner member +#[derive(Deserialize, Debug)] +pub struct ESReportTimestampTimestamp { + pub timestamp: String, +} + +/// Contains several levels to get to the report's timestamp. +/// To be used as for ESHits. +/// ```json +///"report" : { +/// "timestamp" : "2021-03-08T20:11:05.862454103+00:00" +///} +/// ``` +#[derive(Deserialize, Debug)] +pub struct ESReportTimestamp { + pub report: ESReportTimestampTimestamp, +} + +/// Member of ESHitsCount +#[derive(Deserialize)] +pub struct ESHitsCountTotals { + pub value: usize, +} + +/// Member of ESHitsCount +#[derive(Deserialize)] +pub struct ESHitsCountHits { + pub total: ESHitsCountTotals, +} + +/// Corresponds to ES response metadata +/// ```json +/// { +/// "took" : 652, +/// "timed_out" : false, +/// "_shards" : { +/// "total" : 5, +/// "successful" : 5, +/// "skipped" : 0, +/// "failed" : 0 +/// }, +/// "hits" : { +/// "total" : { +/// "value" : 0, +/// "relation" : "eq" +/// }, +/// "max_score" : null, +/// "hits" : [ ] +/// } +/// } +/// ``` +#[derive(Deserialize)] +pub struct ESHitsCount { + pub hits: ESHitsCountHits, +} + +/// Top level contents of _source +/// To be used as for ESHits. +/// ```json +/// "_source" : { +/// "login" : "MarkStefanovic", +/// "id" : 13571999, +/// "node_id" : "MDQ6VXNlcjEzNTcxOTk5", +/// "avatar_url" : "https://avatars.githubusercontent.com/u/13571999?v=4", +/// "name" : "Mark Stefanovic", +/// "company" : null, +/// "blog" : "", +/// "location" : "US", +/// "email" : null, +/// "hireable" : null, +/// "bio" : null, +/// "twitter_username" : null, +/// "public_repos" : 18, +/// "public_gists" : 0, +/// "followers" : 2, +/// "following" : 0, +/// "created_at" : "2015-07-30T12:56:48Z", +/// "updated_at" : "2021-07-13T10:29:00Z", +///} +/// ``` +#[derive(Deserialize, Debug)] +pub struct ESSourceDev { + /// GH login + pub login: Option, + /// An ISO3389 timestamp of when the gh a/c was created (from GH) + /// e.g. 2013-11-13T05:06:37Z + pub created_at: Option, + /// Public email address from GH + pub email: Option, + /// A free-text location from GH + pub location: Option, +}