diff --git a/.github/workflows/cla.yaml b/.github/workflows/cla.yaml index 8b1171961..a4599cf8a 100644 --- a/.github/workflows/cla.yaml +++ b/.github/workflows/cla.yaml @@ -41,5 +41,6 @@ jobs: #custom-notsigned-prcomment: 'pull request comment with Introductory message to ask new contributors to sign' #custom-pr-sign-comment: 'The signature to be committed in order to sign the CLA' #custom-allsigned-prcomment: 'pull request comment when all contributors has signed, defaults to **CLA Assistant Lite bot** All Contributors have signed the CLA.' - #lock-pullrequest-aftermerge: false - if you don't want this bot to automatically lock the pull request after merging (default - true) + #- if you don't want this bot to automatically lock the pull request after merging (default - true) + lock-pullrequest-aftermerge: false #use-dco-flag: true - If you are using DCO instead of CLA diff --git a/.gitignore b/.gitignore index c3994ec35..bf69b23aa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ target -data -staging +data* +staging* limitcache examples cert.pem diff --git a/CNAME b/CNAME index a171e3429..acb13d02f 100644 --- a/CNAME +++ b/CNAME @@ -1 +1 @@ -charts.parseable.io \ No newline at end of file +charts.parseable.com diff --git a/Cargo.lock b/Cargo.lock index e293a0663..61117c66a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1490,9 +1490,9 @@ dependencies = [ [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", @@ -1750,9 +1750,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.17" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66b91535aa35fea1523ad1b86cb6b53c28e0ae566ba4a460f4457e936cad7c6f" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ "bytes", "fnv", @@ -1760,7 +1760,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap 1.9.2", + "indexmap 2.0.1", "slab", "tokio", "tokio-util", @@ -1839,6 +1839,15 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "hostname" version = "0.3.1" @@ -2076,7 +2085,7 @@ checksum = "22e18b0a45d56fe973d6db23972bf5bc46f988a4a2385deac9cc29572f09daef" dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", - "rustix", + "rustix 0.36.16", "windows-sys 0.45.0", ] @@ -2098,6 +2107,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "0.4.8" @@ -2231,6 +2249,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + [[package]] name = "local-channel" version = "0.1.3" @@ -2383,9 +2407,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.8" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" dependencies = [ "libc", "log", @@ -2393,6 +2417,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + [[package]] name = "nom" version = "7.1.3" @@ -2660,7 +2690,7 @@ dependencies = [ [[package]] name = "parseable" -version = "0.7.3" +version = "1.0.0" dependencies = [ "actix-cors", "actix-web", @@ -2713,6 +2743,9 @@ dependencies = [ "parquet", "path-clean", "prometheus", + "prometheus-parse", + "prost", + "prost-build", "rand", "regex", "relative-path", @@ -2725,6 +2758,7 @@ dependencies = [ "serde_json", "serde_repr", "sha1_smol", + "sha2", "static-files", "sysinfo", "thiserror", @@ -2880,6 +2914,16 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "prettyplease" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae005bd773ab59b4725093fd7df83fd7892f7d8eafb48dbd7de6e024e4215f9d" +dependencies = [ + "proc-macro2", + "syn 2.0.37", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2929,7 +2973,7 @@ dependencies = [ "byteorder", "hex", "lazy_static", - "rustix", + "rustix 0.36.16", ] [[package]] @@ -2949,21 +2993,55 @@ dependencies = [ "thiserror", ] +[[package]] +name = "prometheus-parse" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "811031bea65e5a401fb2e1f37d802cca6601e204ac463809a3189352d13b78a5" +dependencies = [ + "chrono", + "itertools 0.12.1", + "once_cell", + "regex", +] + [[package]] name = "prost" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4fdd22f3b9c31b53c060df4a0613a1c7f062d4115a2b984dd15b1858f7e340d" +checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" dependencies = [ "bytes", "prost-derive", ] +[[package]] +name = "prost-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" +dependencies = [ + "bytes", + "heck", + "itertools 0.11.0", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.37", + "tempfile", + "which", +] + [[package]] name = "prost-derive" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "265baba7fabd416cf5078179f7d2cbeca4ce7a9041111900675ea7c4cb8a4c32" +checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" dependencies = [ "anyhow", "itertools 0.11.0", @@ -2972,6 +3050,15 @@ dependencies = [ "syn 2.0.37", ] +[[package]] +name = "prost-types" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +dependencies = [ + "prost", +] + [[package]] name = "protobuf" version = "2.28.0" @@ -3245,10 +3332,23 @@ dependencies = [ "errno", "io-lifetimes", "libc", - "linux-raw-sys", + "linux-raw-sys 0.1.4", "windows-sys 0.45.0", ] +[[package]] +name = "rustix" +version = "0.38.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc99bc2d4f1fed22595588a013687477aedf3cdcfb26558c559edb67b4d9b22e" +dependencies = [ + "bitflags 2.4.0", + "errno", + "libc", + "linux-raw-sys 0.4.13", + "windows-sys 0.48.0", +] + [[package]] name = "rustls" version = "0.20.8" @@ -3430,9 +3530,9 @@ checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" [[package]] name = "sha2" -version = "0.10.6" +version = "0.10.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e6b795fe2e3b1e845bafcb27aa35405c4d47cdfc92af5fc8d3002f76cebdc0" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" dependencies = [ "cfg-if", "cpufeatures", @@ -4390,6 +4490,18 @@ dependencies = [ "webpki", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix 0.38.25", +] + [[package]] name = "winapi" version = "0.3.9" @@ -4452,6 +4564,15 @@ dependencies = [ "windows-targets 0.48.1", ] +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + [[package]] name = "windows-targets" version = "0.42.1" @@ -4482,6 +4603,21 @@ dependencies = [ "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + [[package]] name = "windows_aarch64_gnullvm" version = "0.42.1" @@ -4494,6 +4630,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + [[package]] name = "windows_aarch64_msvc" version = "0.39.0" @@ -4512,6 +4654,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + [[package]] name = "windows_i686_gnu" version = "0.39.0" @@ -4530,6 +4678,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_gnu" 
+version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + [[package]] name = "windows_i686_msvc" version = "0.39.0" @@ -4548,6 +4702,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + [[package]] name = "windows_x86_64_gnu" version = "0.39.0" @@ -4566,6 +4726,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + [[package]] name = "windows_x86_64_gnullvm" version = "0.42.1" @@ -4578,6 +4744,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + [[package]] name = "windows_x86_64_msvc" version = "0.39.0" @@ -4596,6 +4768,12 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + [[package]] name = "winreg" version = "0.10.1" diff --git a/Dockerfile b/Dockerfile index b61b03831..b03ae1a42 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,7 +14,7 @@ # along with this program. If not, see . # build stage -FROM rust:1.73.0-bookworm as builder +FROM rust:1.77.1-bookworm as builder LABEL org.opencontainers.image.title="Parseable" LABEL maintainer="Parseable Team " diff --git a/Dockerfile.debug b/Dockerfile.debug index 2bed98e79..f7112ef22 100644 --- a/Dockerfile.debug +++ b/Dockerfile.debug @@ -14,7 +14,7 @@ # along with this program. If not, see . # build stage -FROM rust:1.73.0-bookworm as builder +FROM rust:1.77.1-bookworm as builder LABEL org.opencontainers.image.title="Parseable" LABEL maintainer="Parseable Team " diff --git a/README.md b/README.md index 737ead69a..1a399f963 100644 --- a/README.md +++ b/README.md @@ -5,35 +5,33 @@ Parseable
- Cloud native log analytics + Log Lake for the cloud-native world
[![Docker Pulls](https://img.shields.io/docker/pulls/parseable/parseable?logo=docker&label=Docker%20Pulls)](https://hub.docker.com/r/parseable/parseable) -[![Slack](https://img.shields.io/badge/slack-brightgreen.svg?logo=slack&label=Community&style=flat&color=%2373DC8C&)](https://launchpass.com/parseable) -[![Docs](https://img.shields.io/badge/stable%20docs-parseable.io%2Fdocs-brightgreen?style=flat&color=%2373DC8C&label=Docs)](https://www.parseable.io/docs) +[![Slack](https://img.shields.io/badge/slack-brightgreen.svg?logo=slack&label=Community&style=flat&color=%2373DC8C&)](https://logg.ing/community) +[![Docs](https://img.shields.io/badge/stable%20docs-parseable.io%2Fdocs-brightgreen?style=flat&color=%2373DC8C&label=Docs)](https://logg.ing/docs) [![Build](https://img.shields.io/github/checks-status/parseablehq/parseable/main?style=flat&color=%2373DC8C&label=Checks)](https://github.com/parseablehq/parseable/actions) [Key Concepts](https://www.parseable.io/docs/concepts) | [Features](https://github.com/parseablehq/parseable#rocket-highlights) | [Documentation](https://www.parseable.io/docs) | [Demo](https://demo.parseable.com/login?q=eyJ1c2VybmFtZSI6ImFkbWluIiwicGFzc3dvcmQiOiJhZG1pbiJ9) | [Integrations](https://www.parseable.io/docs/category/integrations) | [FAQ](https://www.parseable.io/docs/faq)
-Parseable is a log analytics platform, built for the modern, cloud native era. Parseable uses a index-free mechanism to organize and query data allowing low latency, and high throughput ingestion and query. +Parseable is a **cloud native log analytics platform, with a focus on performance & resource efficiency**. Parseable is useful for use cases where **complete data ownership, security and privacy are paramount**. -To get started, download the Parseable binary from [releases page ↗︎](https://github.com/parseablehq/parseable/releases/latest) and run it on your machine. +To experience the Parseable UI, check out [demo.parseable.com ↗︎](https://demo.parseable.com/login?q=eyJ1c2VybmFtZSI6ImFkbWluIiwicGFzc3dvcmQiOiJhZG1pbiJ9). You can also view the [demo video ↗︎](https://www.parseable.com/video.mp4). -For comparison, Parseable consumes up to **_~80% lower memory_** and **_~50% lower CPU_** than Elastic for similar ingestion throughput. Read more in the [benchmarks directory ↗︎](./benchmarks/). +## QuickStart :zap: -For :stethoscope: commercial support and consultation, please reach out to us at [`sales@parseable.io` ↗︎](mailto:sales@parseable.io). +
+Docker Image +

-![Parseable Console](https://raw.githubusercontent.com/parseablehq/.github/main/images/console.png) +You can get started with Parseable Docker with a simple Docker run and then send data via cURL to understand how you can ingest data to Parseable. Below is the command to run Parseable in local storage mode with Docker. -## :zap: Quickstart - -Deploy Parseable in local storage mode with Docker. - -```sh +```bash docker run -p 8000:8000 \ parseable/parseable:latest \ parseable local-store @@ -43,7 +41,65 @@ Once this runs successfully, you'll see dashboard at [http://localhost:8000 ↗ To ingest data, run the below command. This will send logs to the `demo` stream. You can see the logs in the dashboard. -```sh +```bash +curl --location --request POST 'http://localhost:8000/api/v1/ingest' \ +--header 'X-P-Stream: demo' \ +--header 'Authorization: Basic YWRtaW46YWRtaW4=' \ +--header 'Content-Type: application/json' \ +--data-raw '[ + { + "id": "434a5f5e-2f5f-11ed-a261-0242ac120002", + "datetime": "24/Jun/2022:14:12:15 +0000", + "host": "153.10.110.81" + } +]' +``` + +

+
+ +
+Executable Binary +

+ +You can download and run the Parseable binary on your laptop. + +- Linux + +```bash +wget https://github.com/parseablehq/parseable/releases/download/v0.9.0/Parseable_x86_64-unknown-linux-gnu -O parseable +chmod +x parseable +./parseable local-store +``` + +- MacOS (Apple Silicon) + +```bash +wget https://github.com/parseablehq/parseable/releases/download/v0.9.0/Parseable_aarch64-apple-darwin -O parseable +chmod +x parseable +./parseable local-store +``` + +- MacOS (Intel) + +```bash +wget https://github.com/parseablehq/parseable/releases/download/v0.9.0/Parseable_x86_64-apple-darwin -O parseable +chmod +x parseable +./parseable local-store +``` + +- Windows + +```bash +Invoke-WebRequest -Uri "https://github.com/parseablehq/parseable/releases/download/v0.9.0/Parseable_x86_64-pc-windows-msvc.exe" -OutFile "C:\parseable.exe" +C:\parseable.exe local-store +``` + +Once this runs successfully, you'll see dashboard at [http://localhost:8000 ↗︎](http://localhost:8000). You can login to the dashboard default credentials `admin`, `admin`. + +To ingest data, run the below command. This will send logs to the `demo` stream. You can see the logs in the dashboard. + +```bash curl --location --request POST 'http://localhost:8000/api/v1/ingest' \ --header 'X-P-Stream: demo' \ --header 'Authorization: Basic YWRtaW46YWRtaW4=' \ @@ -57,38 +113,56 @@ curl --location --request POST 'http://localhost:8000/api/v1/ingest' \ ]' ``` -## :rocket: Highlights +

+
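After you have ingested a few events with either method above, you can read them back over Parseable's SQL query API. The snippet below is a minimal sketch, assuming the default `/api/v1/query` endpoint, the default `admin`/`admin` credentials, and a request body with `query`, `startTime` and `endTime` fields; adjust the stream name and the time range to cover the events you just sent.

```bash
# Query the demo stream over a broad time window (narrow it to your ingestion time)
curl --location --request POST 'http://localhost:8000/api/v1/query' \
--header 'Authorization: Basic YWRtaW46YWRtaW4=' \
--header 'Content-Type: application/json' \
--data-raw '{
    "query": "SELECT * FROM demo LIMIT 10",
    "startTime": "2024-01-01T00:00:00+00:00",
    "endTime": "2024-12-31T23:59:59+00:00"
}'
```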
+ +## Why Parseable :question: -- Choose storage backend - local drive or S3 (or compatible) object store. -- Ingestion API compatible with HTTP + JSON output of log agents. -- Query log data with PostgreSQL compatible SQL. -- Single binary includes all components - ingestion, store and query. Built-in UI. +### Performance & resource efficiency + +Parseable is written in Rust, with a clear focus on performance while ensuring a much lower CPU and memory footprint (compared to Java or Go based systems). When compared with Elastic, Parseable uses ~80% less memory and ~50% less CPU, while offering a better ingestion rate. This means you can run Parseable on smaller instances, saving costs. + +### Ease of use + +One of the key challenges users said they face today is the complexity of setting up a logging system like Elastic. There are so many moving parts, and it's hard to get started. Parseable is designed to be simple to use, with a single binary that can be run almost anywhere. The Console is built into the binary itself, so you can start using it without any additional setup. + +### Take control of your data + +With Apache Arrow and Apache Parquet as the underlying data formats, Parseable stores log data in an optimized, compressed manner as Parquet files. This means you get complete control and access to your data. You can use Parseable for query and analysis, but can also plug in tools from the wider Parquet ecosystem for further processing, analysis, and visualization. ### Enterprise ready +- High availability & Cluster mode +- Local cache & storage +- [OpenTelemetry support ↗︎](https://opentelemetry.io/) - [Alerts ↗︎](https://www.parseable.io/docs/alerts) -[RBAC ↗︎](https://www.parseable.io/docs/rbac) -[OAuth2 ↗︎](https://www.parseable.io/docs/oidc) -[Grafana ↗︎](https://github.com/parseablehq/parseable-datasource) +- [Role based access control ↗︎](https://www.parseable.io/docs/rbac) +- [OAuth2 support ↗︎](https://www.parseable.io/docs/oidc) +- [Grafana based visualization ↗︎](https://github.com/parseablehq/parseable-datasource) - [LLM ↗︎](https://www.parseable.io/docs/llm) - [Stats ↗︎](https://www.postman.com/parseable/workspace/parseable/request/22353706-b32abe55-f0c4-4ed2-9add-110d265888c3) -## :dart: Motivation +## How do people use Parseable :bulb: + +- **Audit & Compliance** - Organizations that need to store logs in a secure, compliant manner. Parseable's direct-to-S3 bucket storage mode ensures that logs are stored in a secure, cost-effective manner, and can be accessed only by authorized users, while all the data is queryable in real-time. + +- **Observability & Monitoring** - A very large chunk of observability data is logs. Organizations that need to monitor their systems, applications, and infrastructure in real-time use Parseable as the primary log storage system so they get timely alerts, and can analyze logs in real-time. + +- **Log Analytics** - Not all logs are created equal. For example, application logs are seldom useful after a few days, but if the same application also logs all user interactions, that data is very valuable for product managers and can be stored for a longer period. Several businesses store such high-value logs and slice and dice them as needed. + +## Motivation :dart: Traditionally, logging has been seen as a text search problem. Log volumes were not high, and data ingestion or storage were not really issues. This led us to today, where all the logging platforms are primarily text search engines.
But with log data growing exponentially, today's log data challenges involve whole lot more – Data ingestion, storage, and observation, all at scale. We are building Parseable to address these challenges. -## :trophy: Contributing +## Contributing :trophy: [Contribution guide ↗︎](https://www.parseable.io/docs/contributing). -![Alt](https://repobeats.axiom.co/api/embed/7c4e0f51cd3b8f78d1da682c396a3b5bd855a6ba.svg "Repobeats analytics image") - -### Contributors - ### Supported by + diff --git a/USERS.md b/USERS.md index c158012da..ec82da99b 100644 --- a/USERS.md +++ b/USERS.md @@ -2,7 +2,7 @@ The following document is a list of users and adopters who use Parseable. The users themselves directly maintain the list. You can add your organization by editing this file directly. -If you're using Parseable in your organization, please add your company name to this list. It really helps the project gain momentum and credibility. It's a small contribution to the project with a big impact. +If you're using Parseable in your organization, please add your company name to this list. It really helps the project gain momentum and credibility. It's a small contribution to the project with a big impact. --- diff --git a/about.hbs b/about.hbs new file mode 100644 index 000000000..76a6b37c1 --- /dev/null +++ b/about.hbs @@ -0,0 +1,70 @@ + + + + + + + +
+
+

Third Party Licenses used in Parseable

+

This page lists the licenses of the projects used in Parseable.

+
+ +

Overview of licenses:

+
    + {{#each overview}} +
  • {{name}} ({{count}})
  • + {{/each}} +
+ +

All license text:

+ +
+ + + diff --git a/about.toml b/about.toml new file mode 100644 index 000000000..5c0c65fe7 --- /dev/null +++ b/about.toml @@ -0,0 +1,19 @@ +accepted = [ + "Apache-2.0", + "MIT", + "BSD-3-Clause", + "ISC", + "MPL-2.0", + "NOASSERTION", + "AGPL-3.0", + "GPL-3.0", + "BSD-2-Clause", + "BSL-1.0", + "OpenSSL", + "Unicode-DFS-2016", +] + +workarounds = [ + "ring", + "rustls", +] diff --git a/helm-reindex.sh b/helm-reindex.sh index 861fe9c32..0511e07fe 100755 --- a/helm-reindex.sh +++ b/helm-reindex.sh @@ -3,4 +3,4 @@ helm package helm -d helm-releases/ helm package ../operator/helm/operator -d helm-releases/ -helm repo index --merge index.yaml --url https://charts.parseable.io . +helm repo index --merge index.yaml --url https://charts.parseable.com . diff --git a/helm-releases/operator-0.0.3.tgz b/helm-releases/operator-0.0.3.tgz index 8ee38369a..dd4f746b9 100644 Binary files a/helm-releases/operator-0.0.3.tgz and b/helm-releases/operator-0.0.3.tgz differ diff --git a/helm-releases/parseable-0.7.3.tgz b/helm-releases/parseable-0.7.3.tgz index b06b29ca6..deb18bcf9 100644 Binary files a/helm-releases/parseable-0.7.3.tgz and b/helm-releases/parseable-0.7.3.tgz differ diff --git a/helm-releases/parseable-0.8.0.tgz b/helm-releases/parseable-0.8.0.tgz new file mode 100644 index 000000000..49ddece31 Binary files /dev/null and b/helm-releases/parseable-0.8.0.tgz differ diff --git a/helm-releases/parseable-0.8.1.tgz b/helm-releases/parseable-0.8.1.tgz new file mode 100644 index 000000000..f6e3632f1 Binary files /dev/null and b/helm-releases/parseable-0.8.1.tgz differ diff --git a/helm-releases/parseable-0.9.0.tgz b/helm-releases/parseable-0.9.0.tgz new file mode 100644 index 000000000..b82d61276 Binary files /dev/null and b/helm-releases/parseable-0.9.0.tgz differ diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 1db39ddce..48783b96d 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -2,12 +2,12 @@ apiVersion: v2 name: parseable description: Helm chart for Parseable Server type: application -version: 0.7.3 -appVersion: "v0.7.3" +version: 0.9.0 +appVersion: "v0.9.0" maintainers: - name: Parseable Team - email: hi@parseable.io - url: https://parseable.io + email: hi@parseable.com + url: https://parseable.com dependencies: - name: vector diff --git a/helm/values.yaml b/helm/values.yaml index 55fec4372..5d64b3d9f 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -1,7 +1,7 @@ parseable: image: repository: parseable/parseable - tag: v0.7.3 + tag: v0.9.0 pullPolicy: Always replicaCount: 1 ## Set to true if you want to deploy Parseable in local mode (store logs diff --git a/index.yaml b/index.yaml index f27dfd62c..ad227aea0 100644 --- a/index.yaml +++ b/index.yaml @@ -1,51 +1,109 @@ apiVersion: v1 entries: - collector: - - apiVersion: v2 - appVersion: 4769fbf - created: "2023-04-16T15:34:14.307979+05:30" - description: Helm chart for Parseable Collector - digest: 5ff2078d91a7ad1265678f027e71bc84a3eec96b957f613f11bec2745040db42 - name: collector - type: application - urls: - - https://charts.parseable.io/helm-releases/collector-0.0.1.tgz - version: 0.0.1 operator: - apiVersion: v2 appVersion: v0.0.3 - created: "2024-01-04T13:22:39.339343045+05:30" + created: "2024-03-04T21:18:31.578535938+05:30" description: A Helm chart for Parseable Operator - digest: 0aeb294b82dd30a405e19023c4a3175fc85caf042660bafc03309e7c883f10bb + digest: 0279d9d34bb8cab5160647eb1bd1d97fcd09aff6dcce315b58a62f5295bd9eb6 name: operator type: application urls: - - https://charts.parseable.io/helm-releases/operator-0.0.3.tgz + - 
https://charts.parseable.com/helm-releases/operator-0.0.3.tgz version: 0.0.3 - apiVersion: v2 appVersion: v0.0.2 - created: "2024-01-04T13:22:39.33757159+05:30" + created: "2024-03-04T21:18:31.577441663+05:30" description: A Helm chart for Parseable Operator digest: 0bf4cd8cc7f1c5ff6d49f91fe91204855a215ae1cb5acaeb3fe84497bc97c566 name: operator type: application urls: - - https://charts.parseable.io/helm-releases/operator-0.0.2.tgz + - https://charts.parseable.com/helm-releases/operator-0.0.2.tgz version: 0.0.2 - apiVersion: v2 appVersion: 0.0.1 - created: "2024-01-04T13:22:39.336738866+05:30" + created: "2024-03-04T21:18:31.575560481+05:30" description: A Helm chart for Parseable Operator digest: 344cedd9e3a0f17c6ff09514dabed994bac7bac94ace500857d487c1c9cc1859 name: operator type: application urls: - - https://charts.parseable.io/helm-releases/operator-0.0.1.tgz + - https://charts.parseable.com/helm-releases/operator-0.0.1.tgz version: 0.0.1 parseable: + - apiVersion: v2 + appVersion: v0.9.0 + created: "2024-03-04T21:18:31.638535002+05:30" + dependencies: + - condition: vector.enabled + name: vector + repository: https://helm.vector.dev + version: 0.20.1 + - condition: fluent-bit.enabled + name: fluent-bit + repository: https://fluent.github.io/helm-charts + version: 0.25.0 + description: Helm chart for Parseable Server + digest: 75e202820f9ce1743bd1e50996b0f6db456b31d928fb4a99f9311f9e20d3a90f + maintainers: + - email: hi@parseable.com + name: Parseable Team + url: https://parseable.com + name: parseable + type: application + urls: + - https://charts.parseable.com/helm-releases/parseable-0.9.0.tgz + version: 0.9.0 + - apiVersion: v2 + appVersion: v0.8.1 + created: "2024-03-04T21:18:31.635712616+05:30" + dependencies: + - condition: vector.enabled + name: vector + repository: https://helm.vector.dev + version: 0.20.1 + - condition: fluent-bit.enabled + name: fluent-bit + repository: https://fluent.github.io/helm-charts + version: 0.25.0 + description: Helm chart for Parseable Server + digest: 3ba2fd1178541fe01472d66420567258ff76bf070848b2ab6b1eccf6b1565df6 + maintainers: + - email: hi@parseable.io + name: Parseable Team + url: https://parseable.io + name: parseable + type: application + urls: + - https://charts.parseable.com/helm-releases/parseable-0.8.1.tgz + version: 0.8.1 + - apiVersion: v2 + appVersion: v0.8.0 + created: "2024-03-04T21:18:31.633351793+05:30" + dependencies: + - condition: vector.enabled + name: vector + repository: https://helm.vector.dev + version: 0.20.1 + - condition: fluent-bit.enabled + name: fluent-bit + repository: https://fluent.github.io/helm-charts + version: 0.25.0 + description: Helm chart for Parseable Server + digest: 50320f397905c455b6dd1fe3120d0ce15d1ea9e044952ae208386f1858fbe75a + maintainers: + - email: hi@parseable.io + name: Parseable Team + url: https://parseable.io + name: parseable + type: application + urls: + - https://charts.parseable.com/helm-releases/parseable-0.8.0.tgz + version: 0.8.0 - apiVersion: v2 appVersion: v0.7.3 - created: "2024-01-04T13:22:39.39222597+05:30" + created: "2024-03-04T21:18:31.630982924+05:30" dependencies: - condition: vector.enabled name: vector @@ -56,7 +114,7 @@ entries: repository: https://fluent.github.io/helm-charts version: 0.25.0 description: Helm chart for Parseable Server - digest: 6c7ca517a725f96b03abf4adf1f09a4ee57a2623c1f6aecaa31f66a7bb19877f + digest: 879958276252d02af9ca360606ce6aa4b8872b14ad4816ecf9429e590076662b maintainers: - email: hi@parseable.io name: Parseable Team @@ -64,11 +122,11 @@ entries: 
name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.7.3.tgz + - https://charts.parseable.com/helm-releases/parseable-0.7.3.tgz version: 0.7.3 - apiVersion: v2 appVersion: v0.7.2 - created: "2024-01-04T13:22:39.388808217+05:30" + created: "2024-03-04T21:18:31.62728704+05:30" dependencies: - condition: vector.enabled name: vector @@ -87,11 +145,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.7.2.tgz + - https://charts.parseable.com/helm-releases/parseable-0.7.2.tgz version: 0.7.2 - apiVersion: v2 appVersion: v0.7.1 - created: "2024-01-04T13:22:39.386689245+05:30" + created: "2024-03-04T21:18:31.624855086+05:30" dependencies: - condition: vector.enabled name: vector @@ -110,11 +168,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.7.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.7.1.tgz version: 0.7.1 - apiVersion: v2 appVersion: v0.7.0 - created: "2024-01-04T13:22:39.384539181+05:30" + created: "2024-03-04T21:18:31.62215143+05:30" dependencies: - condition: vector.enabled name: vector @@ -133,11 +191,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.7.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.7.0.tgz version: 0.7.0 - apiVersion: v2 appVersion: v0.6.2 - created: "2024-01-04T13:22:39.380706938+05:30" + created: "2024-03-04T21:18:31.618314254+05:30" dependencies: - condition: vector.enabled name: vector @@ -156,11 +214,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.6.2.tgz + - https://charts.parseable.com/helm-releases/parseable-0.6.2.tgz version: 0.6.2 - apiVersion: v2 appVersion: v0.6.1 - created: "2024-01-04T13:22:39.37727176+05:30" + created: "2024-03-04T21:18:31.616105468+05:30" dependencies: - condition: vector.enabled name: vector @@ -179,11 +237,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.6.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.6.1.tgz version: 0.6.1 - apiVersion: v2 appVersion: v0.6.0 - created: "2024-01-04T13:22:39.37384627+05:30" + created: "2024-03-04T21:18:31.613451542+05:30" dependencies: - condition: vector.enabled name: vector @@ -202,11 +260,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.6.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.6.0.tgz version: 0.6.0 - apiVersion: v2 appVersion: v0.5.1 - created: "2024-01-04T13:22:39.369713271+05:30" + created: "2024-03-04T21:18:31.610243878+05:30" dependencies: - condition: vector.enabled name: vector @@ -225,11 +283,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.5.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.5.1.tgz version: 0.5.1 - apiVersion: v2 appVersion: v0.5.0 - created: "2024-01-04T13:22:39.366281517+05:30" + created: "2024-03-04T21:18:31.608035931+05:30" dependencies: - condition: vector.enabled name: vector @@ -248,11 +306,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.5.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.5.0.tgz version: 0.5.0 - apiVersion: v2 appVersion: v0.4.4 - created: "2024-01-04T13:22:39.362784622+05:30" + created: 
"2024-03-04T21:18:31.605745147+05:30" dependencies: - condition: vector.enabled name: vector @@ -271,11 +329,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.4.5.tgz + - https://charts.parseable.com/helm-releases/parseable-0.4.5.tgz version: 0.4.5 - apiVersion: v2 appVersion: v0.4.3 - created: "2024-01-04T13:22:39.359584499+05:30" + created: "2024-03-04T21:18:31.602399707+05:30" dependencies: - condition: vector.enabled name: vector @@ -294,11 +352,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.4.4.tgz + - https://charts.parseable.com/helm-releases/parseable-0.4.4.tgz version: 0.4.4 - apiVersion: v2 appVersion: v0.4.2 - created: "2024-01-04T13:22:39.357447924+05:30" + created: "2024-03-04T21:18:31.598946927+05:30" dependencies: - condition: vector.enabled name: vector @@ -317,11 +375,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.4.3.tgz + - https://charts.parseable.com/helm-releases/parseable-0.4.3.tgz version: 0.4.3 - apiVersion: v2 appVersion: v0.4.1 - created: "2024-01-04T13:22:39.355364145+05:30" + created: "2024-03-04T21:18:31.595295298+05:30" dependencies: - condition: vector.enabled name: vector @@ -340,11 +398,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.4.2.tgz + - https://charts.parseable.com/helm-releases/parseable-0.4.2.tgz version: 0.4.2 - apiVersion: v2 appVersion: v0.4.0 - created: "2024-01-04T13:22:39.352152258+05:30" + created: "2024-03-04T21:18:31.59184319+05:30" dependencies: - condition: vector.enabled name: vector @@ -363,11 +421,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.4.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.4.1.tgz version: 0.4.1 - apiVersion: v2 appVersion: v0.4.0 - created: "2024-01-04T13:22:39.349617487+05:30" + created: "2024-03-04T21:18:31.588706658+05:30" dependencies: - condition: vector.enabled name: vector @@ -386,11 +444,11 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.4.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.4.0.tgz version: 0.4.0 - apiVersion: v2 appVersion: v0.3.1 - created: "2024-01-04T13:22:39.346764187+05:30" + created: "2024-03-04T21:18:31.585056257+05:30" dependencies: - condition: vector.enabled name: vector @@ -409,127 +467,127 @@ entries: name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.3.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.3.1.tgz version: 0.3.1 - apiVersion: v2 appVersion: v0.3.0 - created: "2024-01-04T13:22:39.342618937+05:30" + created: "2024-03-04T21:18:31.581498864+05:30" description: Helm chart for Parseable Server digest: ff30739229b727dc637f62fd4481c886a6080ce4556bae10cafe7642ddcfd937 name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.3.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.3.0.tgz version: 0.3.0 - apiVersion: v2 appVersion: v0.2.2 - created: "2024-01-04T13:22:39.342307108+05:30" + created: "2024-03-04T21:18:31.581284341+05:30" description: Helm chart for Parseable Server digest: 477d0dc2f0c07d4f4c32e105d4bdd70c71113add5c2a75ac5f1cb42aa0276db7 name: parseable type: application urls: - - 
https://charts.parseable.io/helm-releases/parseable-0.2.2.tgz + - https://charts.parseable.com/helm-releases/parseable-0.2.2.tgz version: 0.2.2 - apiVersion: v2 appVersion: v0.2.1 - created: "2024-01-04T13:22:39.342005942+05:30" + created: "2024-03-04T21:18:31.581089413+05:30" description: Helm chart for Parseable Server digest: 84826fcd1b4c579f301569f43b0309c07e8082bad76f5cdd25f86e86ca2e8192 name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.2.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.2.1.tgz version: 0.2.1 - apiVersion: v2 appVersion: v0.2.0 - created: "2024-01-04T13:22:39.341730663+05:30" + created: "2024-03-04T21:18:31.580835869+05:30" description: Helm chart for Parseable Server digest: 7a759f7f9809f3935cba685e904c021a0b645f217f4e45b9be185900c467edff name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.2.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.2.0.tgz version: 0.2.0 - apiVersion: v2 appVersion: v0.1.1 - created: "2024-01-04T13:22:39.341457747+05:30" + created: "2024-03-04T21:18:31.580614668+05:30" description: Helm chart for Parseable Server digest: 37993cf392f662ec7b1fbfc9a2ba00ec906d98723e38f3c91ff1daca97c3d0b3 name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.1.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.1.1.tgz version: 0.1.1 - apiVersion: v2 appVersion: v0.1.0 - created: "2024-01-04T13:22:39.3411927+05:30" + created: "2024-03-04T21:18:31.580397661+05:30" description: Helm chart for Parseable Server digest: 1d580d072af8d6b1ebcbfee31c2e16c907d08db754780f913b5f0032b403789b name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.1.0.tgz + - https://charts.parseable.com/helm-releases/parseable-0.1.0.tgz version: 0.1.0 - apiVersion: v2 appVersion: v0.0.8 - created: "2024-01-04T13:22:39.340923513+05:30" + created: "2024-03-04T21:18:31.580177565+05:30" description: Helm chart for Parseable Server digest: c805254ffa634f96ecec448bcfff9973339aa9487dd8199b21b17b79a4de9345 name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.0.8.tgz + - https://charts.parseable.com/helm-releases/parseable-0.0.8.tgz version: 0.0.8 - apiVersion: v2 appVersion: v0.0.7 - created: "2024-01-04T13:22:39.340665206+05:30" + created: "2024-03-04T21:18:31.579952523+05:30" description: Helm chart for Parseable Server digest: c591f617ed1fe820bb2c72a4c976a78126f1d1095d552daa07c4700f46c4708a name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.0.7.tgz + - https://charts.parseable.com/helm-releases/parseable-0.0.7.tgz version: 0.0.7 - apiVersion: v2 appVersion: v0.0.6 - created: "2024-01-04T13:22:39.340401784+05:30" + created: "2024-03-04T21:18:31.5797268+05:30" description: Helm chart for Parseable Server digest: f9ae56a6fcd6a59e7bee0436200ddbedeb74ade6073deb435b8fcbaf08dda795 name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.0.6.tgz + - https://charts.parseable.com/helm-releases/parseable-0.0.6.tgz version: 0.0.6 - apiVersion: v2 appVersion: v0.0.5 - created: "2024-01-04T13:22:39.340138909+05:30" + created: "2024-03-04T21:18:31.579503422+05:30" description: Helm chart for Parseable Server digest: 4d6b08a064fba36e16feeb820b77e1e8e60fb6de48dbf7ec8410d03d10c26ad0 name: parseable type: application urls: - - 
https://charts.parseable.io/helm-releases/parseable-0.0.5.tgz + - https://charts.parseable.com/helm-releases/parseable-0.0.5.tgz version: 0.0.5 - apiVersion: v2 appVersion: v0.0.2 - created: "2024-01-04T13:22:39.339859109+05:30" + created: "2024-03-04T21:18:31.579259667+05:30" description: Helm chart for Parseable Server digest: 38a0a3e4c498afbbcc76ebfcb9cb598fa2ca843a53cc93b3cb4f135b85c10844 name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.0.2.tgz + - https://charts.parseable.com/helm-releases/parseable-0.0.2.tgz version: 0.0.2 - apiVersion: v2 appVersion: v0.0.1 - created: "2024-01-04T13:22:39.339616047+05:30" + created: "2024-03-04T21:18:31.57896144+05:30" description: Helm chart for Parseable Server digest: 1f1142db092b9620ee38bb2294ccbb1c17f807b33bf56da43816af7fe89f301e name: parseable type: application urls: - - https://charts.parseable.io/helm-releases/parseable-0.0.1.tgz + - https://charts.parseable.com/helm-releases/parseable-0.0.1.tgz version: 0.0.1 parseable-operator: - apiVersion: v2 @@ -552,4 +610,4 @@ entries: urls: - https://charts.parseable.io/helm-releases/parseable-operator-0.0.1.tgz version: 0.0.1 -generated: "2024-01-04T13:22:39.335812243+05:30" +generated: "2024-03-04T21:18:31.573935838+05:30" diff --git a/server/Cargo.toml b/server/Cargo.toml index 57afe65c5..99eb9722a 100644 --- a/server/Cargo.toml +++ b/server/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "parseable" -version = "0.7.3" -authors = ["Parseable Team "] +version = "1.0.0" +authors = ["Parseable Team "] edition = "2021" -rust-version = "1.73" +rust-version = "1.77.1" categories = ["logging", "observability", "log analytics"] build = "build.rs" @@ -103,6 +103,9 @@ http-auth-basic = "0.3.3" serde_repr = "0.1.17" hashlru = { version = "0.11.0", features = ["serde"] } path-clean = "1.0.1" +prost = "0.12.3" +prometheus-parse = "0.2.5" +sha2 = "0.10.8" [build-dependencies] cargo_toml = "0.15" @@ -112,14 +115,15 @@ ureq = "2.6" vergen = { version = "8.1", features = ["build", "git", "cargo", "gitcl"] } zip = { version = "0.6", default_features = false, features = ["deflate"] } url = "2.4.0" +prost-build = "0.12.3" [dev-dependencies] maplit = "1.0" rstest = "0.16" [package.metadata.parseable_ui] -assets-url = "https://github.com/parseablehq/console/releases/download/v0.3.5/build.zip" -assets-sha1 = "c3fea8f87769273f60fc988e0d1c45a94fe4d2ad" +assets-url = "https://github.com/parseablehq/console/releases/download/v0.6.1/build.zip" +assets-sha1 = "88035328ca6a045dd939615dc04b0c6ad9c6cb75" [features] debug = [] diff --git a/server/src/about.rs b/server/src/about.rs index c18df7600..f6680c919 100644 --- a/server/src/about.rs +++ b/server/src/about.rs @@ -93,7 +93,7 @@ pub fn print_about( Version: \"v{}\"", "About:".to_string().bold(), current_version, - ); + ); // " " " " if let Some(latest_release) = latest_release { if latest_release.version > current_version { @@ -104,7 +104,7 @@ pub fn print_about( eprintln!( " Commit: \"{commit_hash}\" - Docs: \"https://www.parseable.io/docs\"" + Docs: \"https://logg.ing/docs\"" ); } diff --git a/server/src/analytics.rs b/server/src/analytics.rs index ca8d172d6..d77e004f5 100644 --- a/server/src/analytics.rs +++ b/server/src/analytics.rs @@ -18,12 +18,16 @@ */ use crate::about::{current, platform}; -use crate::option::CONFIG; +use crate::handlers::http::cluster::utils::check_liveness; +use crate::handlers::http::{base_path_without_preceding_slash, cluster}; +use crate::option::{Mode, CONFIG}; use crate::storage; use 
crate::{metadata, stats}; +use actix_web::{web, HttpRequest, Responder}; use chrono::{DateTime, Utc}; use clokwerk::{AsyncScheduler, Interval}; +use http::header; use once_cell::sync::Lazy; use serde::{Deserialize, Serialize}; use serde_json::Value; @@ -56,14 +60,21 @@ pub struct Report { cpu_count: usize, memory_total_bytes: u64, platform: String, - mode: String, + storage_mode: String, + server_mode: String, version: String, commit_hash: String, + active_ingestors: u64, + inactive_ingestors: u64, + stream_count: usize, + total_events_count: u64, + total_json_bytes: u64, + total_parquet_bytes: u64, metrics: HashMap, } impl Report { - pub fn new() -> Self { + pub async fn new() -> anyhow::Result { let mut upt: f64 = 0.0; if let Ok(uptime) = uptime_lib::get() { upt = uptime.as_secs_f64(); @@ -80,8 +91,9 @@ impl Report { cpu_count = info.cpus().len(); mem_total = info.total_memory(); } + let ingestor_metrics = fetch_ingestors_metrics().await?; - Self { + Ok(Self { deployment_id: storage::StorageMetadata::global().deployment_id, uptime: upt, report_created_at: Utc::now(), @@ -90,11 +102,18 @@ impl Report { cpu_count, memory_total_bytes: mem_total, platform: platform().to_string(), - mode: CONFIG.mode_string().to_string(), + storage_mode: CONFIG.get_storage_mode_string().to_string(), + server_mode: CONFIG.parseable.mode.to_string(), version: current().released_version.to_string(), commit_hash: current().commit_hash, - metrics: build_metrics(), - } + active_ingestors: ingestor_metrics.0, + inactive_ingestors: ingestor_metrics.1, + stream_count: ingestor_metrics.2, + total_events_count: ingestor_metrics.3, + total_json_bytes: ingestor_metrics.4, + total_parquet_bytes: ingestor_metrics.5, + metrics: build_metrics().await, + }) } pub async fn send(&self) { @@ -103,6 +122,12 @@ impl Report { } } +/// build the node metrics for the node ingestor endpoint +pub async fn get_analytics(_: HttpRequest) -> impl Responder { + let json = NodeMetrics::build(); + web::Json(json) +} + fn total_streams() -> usize { metadata::STREAM_INFO.list_streams().len() } @@ -123,25 +148,65 @@ fn total_event_stats() -> (u64, u64, u64) { (total_events, total_json_bytes, total_parquet_bytes) } -fn build_metrics() -> HashMap { +async fn fetch_ingestors_metrics() -> anyhow::Result<(u64, u64, usize, u64, u64, u64)> { + let event_stats = total_event_stats(); + let mut node_metrics = + NodeMetrics::new(total_streams(), event_stats.0, event_stats.1, event_stats.2); + + let mut vec = vec![]; + let mut active_ingestors = 0u64; + let mut offline_ingestors = 0u64; + if CONFIG.parseable.mode == Mode::Query { + // send analytics for ingest servers + + // ingestor infos should be valid here, if not some thing is wrong + let ingestor_infos = cluster::get_ingestor_info().await.unwrap(); + + for im in ingestor_infos { + if !check_liveness(&im.domain_name).await { + offline_ingestors += 1; + continue; + } + + let uri = url::Url::parse(&format!( + "{}{}/analytics", + im.domain_name, + base_path_without_preceding_slash() + )) + .expect("Should be a valid URL"); + + let resp = reqwest::Client::new() + .get(uri) + .header(header::AUTHORIZATION, im.token.clone()) + .header(header::CONTENT_TYPE, "application/json") + .send() + .await + .expect("should respond"); + + let data = serde_json::from_slice::(&resp.bytes().await?)?; + vec.push(data); + active_ingestors += 1; + } + + node_metrics.accumulate(&mut vec); + } + + Ok(( + active_ingestors, + offline_ingestors, + node_metrics.stream_count, + node_metrics.total_events_count, + 
node_metrics.total_json_bytes, + node_metrics.total_parquet_bytes, + )) +} + +async fn build_metrics() -> HashMap { // sysinfo refreshed in previous function // so no need to refresh again let sys = SYS_INFO.lock().unwrap(); let mut metrics = HashMap::new(); - metrics.insert("stream_count".to_string(), total_streams().into()); - - // total_event_stats returns event count, json bytes, parquet bytes in that order - metrics.insert( - "total_events_count".to_string(), - total_event_stats().0.into(), - ); - metrics.insert("total_json_bytes".to_string(), total_event_stats().1.into()); - metrics.insert( - "total_parquet_bytes".to_string(), - total_event_stats().2.into(), - ); - metrics.insert("memory_in_use_bytes".to_string(), sys.used_memory().into()); metrics.insert("memory_free_bytes".to_string(), sys.free_memory().into()); @@ -155,14 +220,23 @@ fn build_metrics() -> HashMap { metrics } -pub fn init_analytics_scheduler() { +pub fn init_analytics_scheduler() -> anyhow::Result<()> { log::info!("Setting up schedular for anonymous user analytics"); let mut scheduler = AsyncScheduler::new(); scheduler .every(ANALYTICS_SEND_INTERVAL_SECONDS) .run(move || async { - Report::new().send().await; + Report::new() + .await + .unwrap_or_else(|err| { + // panicing because seperate thread + // TODO: a better way to handle this + log::error!("Error while sending analytics: {}", err.to_string()); + panic!("{}", err.to_string()); + }) + .send() + .await; }); tokio::spawn(async move { @@ -171,4 +245,48 @@ pub fn init_analytics_scheduler() { tokio::time::sleep(Duration::from_secs(10)).await; } }); + + Ok(()) +} + +#[derive(Serialize, Deserialize, Default, Debug)] +struct NodeMetrics { + stream_count: usize, + total_events_count: u64, + total_json_bytes: u64, + total_parquet_bytes: u64, +} + +impl NodeMetrics { + fn build() -> Self { + let event_stats = total_event_stats(); + Self { + stream_count: total_streams(), + total_events_count: event_stats.0, + total_json_bytes: event_stats.1, + total_parquet_bytes: event_stats.2, + } + } + + fn new( + stream_count: usize, + total_events_count: u64, + total_json_bytes: u64, + total_parquet_bytes: u64, + ) -> Self { + Self { + stream_count, + total_events_count, + total_json_bytes, + total_parquet_bytes, + } + } + + fn accumulate(&mut self, other: &mut [NodeMetrics]) { + other.iter().for_each(|nm| { + self.total_events_count += nm.total_events_count; + self.total_json_bytes += nm.total_json_bytes; + self.total_parquet_bytes += nm.total_parquet_bytes; + }); + } } diff --git a/server/src/banner.rs b/server/src/banner.rs index 0f1dc5120..ca665ffa4 100644 --- a/server/src/banner.rs +++ b/server/src/banner.rs @@ -35,13 +35,13 @@ pub async fn print(config: &Config, meta: &StorageMetadata) { fn print_ascii_art() { let ascii_name = r#" - `7MM"""Mq. *MM `7MM - MM `MM. MM MM - MM ,M9 ,6"Yb. `7Mb,od8 ,pP"Ybd .gP"Ya ,6"Yb. MM,dMMb. MM .gP"Ya - MMmmdM9 8) MM MM' "' 8I `" ,M' Yb 8) MM MM `Mb MM ,M' Yb - MM ,pm9MM MM `YMMMa. 8M"""""" ,pm9MM MM M8 MM 8M"""""" - MM 8M MM MM L. I8 YM. , 8M MM MM. ,M9 MM YM. , - .JMML. `Moo9^Yo..JMML. M9mmmP' `Mbmmd' `Moo9^Yo. P^YbmdP' .JMML. `Mbmmd' + `7MM"""Mq. *MM `7MM + MM `MM. MM MM + MM ,M9 ,6"Yb. `7Mb,od8 ,pP"Ybd .gP"Ya ,6"Yb. MM,dMMb. MM .gP"Ya + MMmmdM9 8) MM MM' "' 8I `" ,M' Yb 8) MM MM `Mb MM ,M' Yb + MM ,pm9MM MM `YMMMa. 8M"""""" ,pm9MM MM M8 MM 8M"""""" + MM 8M MM MM L. I8 YM. , 8M MM MM. ,M9 MM YM. , + .JMML. `Moo9^Yo..JMML. M9mmmP' `Mbmmd' `Moo9^Yo. P^YbmdP' .JMML. 
`Mbmmd' "#; eprint!("{ascii_name}"); @@ -79,10 +79,12 @@ fn status_info(config: &Config, scheme: &str, id: Uid) { {} Address: {} Credentials: {} + Server Mode: \"{}\" LLM Status: \"{}\"", "Server:".to_string().bold(), address, credentials, + config.get_server_mode_string(), llm_status ); } @@ -99,10 +101,10 @@ async fn storage_info(config: &Config) { eprintln!( " {} - Mode: \"{}\" - Staging: \"{}\"", + Storage Mode: \"{}\" + Staging Path: \"{}\"", "Storage:".to_string().bold(), - config.mode_string(), + config.get_storage_mode_string(), config.staging_dir().to_string_lossy(), ); @@ -114,7 +116,7 @@ async fn storage_info(config: &Config) { eprintln!( "\ - {:8}Cache: \"{}\", (size: {})", + {:8}Cache: \"{}\", (size: {})", "", path.display(), size diff --git a/server/src/catalog.rs b/server/src/catalog.rs index f8adad1ca..2ad1dfe0f 100644 --- a/server/src/catalog.rs +++ b/server/src/catalog.rs @@ -16,19 +16,21 @@ * */ -use std::sync::Arc; - -use chrono::{DateTime, NaiveDateTime, NaiveTime, Utc}; -use relative_path::RelativePathBuf; +use std::{io::ErrorKind, sync::Arc}; +use self::{column::Column, snapshot::ManifestItem}; +use crate::handlers::http::base_path_without_preceding_slash; +use crate::option::CONFIG; use crate::{ catalog::manifest::Manifest, query::PartialTimeFilter, - storage::{ObjectStorage, ObjectStorageError}, + storage::{object_storage::manifest_path, ObjectStorage, ObjectStorageError}, }; - -use self::{column::Column, snapshot::ManifestItem}; - +use crate::{handlers, Mode}; +use bytes::Bytes; +use chrono::{DateTime, Local, NaiveDateTime, NaiveTime, Utc}; +use relative_path::RelativePathBuf; +use std::io::Error as IOError; pub mod column; pub mod manifest; pub mod snapshot; @@ -69,56 +71,105 @@ impl ManifestFile for manifest::File { } } +fn get_file_bounds(file: &manifest::File) -> (DateTime, DateTime) { + match file + .columns() + .iter() + .find(|col| col.name == "p_timestamp") + .unwrap() + .stats + .clone() + .unwrap() + { + column::TypedStatistics::Int(stats) => ( + NaiveDateTime::from_timestamp_millis(stats.min) + .unwrap() + .and_utc(), + NaiveDateTime::from_timestamp_millis(stats.max) + .unwrap() + .and_utc(), + ), + _ => unreachable!(), + } +} + pub async fn update_snapshot( storage: Arc, stream_name: &str, change: manifest::File, ) -> Result<(), ObjectStorageError> { - fn get_file_bounds(file: &manifest::File) -> (DateTime, DateTime) { - match file - .columns() - .iter() - .find(|col| col.name == "p_timestamp") - .unwrap() - .stats - .clone() - .unwrap() - { - column::TypedStatistics::Int(stats) => ( - NaiveDateTime::from_timestamp_millis(stats.min) - .unwrap() - .and_utc(), - NaiveDateTime::from_timestamp_millis(stats.min) - .unwrap() - .and_utc(), - ), - _ => unreachable!(), - } - } - // get current snapshot - let mut meta = storage.get_snapshot(stream_name).await?; - let manifests = &mut meta.manifest_list; + let mut meta = storage.get_object_store_format(stream_name).await?; + let manifests = &mut meta.snapshot.manifest_list; let (lower_bound, _) = get_file_bounds(&change); let pos = manifests.iter().position(|item| { item.time_lower_bound <= lower_bound && lower_bound < item.time_upper_bound }); + // if the mode in I.S. manifest needs to be created but it is not getting created because + // there is already a pos, to index into stream.json + // We update the manifest referenced by this position // This updates an existing file so there is no need to create a snapshot entry. 
if let Some(pos) = pos { let info = &mut manifests[pos]; let path = partition_path(stream_name, info.time_lower_bound, info.time_upper_bound); - let Some(mut manifest) = storage.get_manifest(&path).await? else { - return Err(ObjectStorageError::UnhandledError( - "Manifest found in snapshot but not in object-storage" - .to_string() - .into(), - )); - }; - manifest.apply_change(change); - storage.put_manifest(&path, manifest).await?; + + let mut ch = false; + for m in manifests.iter() { + let p = manifest_path("").to_string(); + if m.manifest_path.contains(&p) { + ch = true; + } + } + if ch { + let Some(mut manifest) = storage.get_manifest(&path).await? else { + return Err(ObjectStorageError::UnhandledError( + "Manifest found in snapshot but not in object-storage" + .to_string() + .into(), + )); + }; + manifest.apply_change(change); + storage.put_manifest(&path, manifest).await?; + } else { + let lower_bound = lower_bound.date_naive().and_time(NaiveTime::MIN).and_utc(); + let upper_bound = lower_bound + .date_naive() + .and_time( + NaiveTime::from_num_seconds_from_midnight_opt( + 23 * 3600 + 59 * 60 + 59, + 999_999_999, + ) + .ok_or(IOError::new( + ErrorKind::Other, + "Failed to create upper bound for manifest", + )) + .map_err(ObjectStorageError::IoError)?, + ) + .and_utc(); + + let manifest = Manifest { + files: vec![change], + ..Manifest::default() + }; + + let mainfest_file_name = manifest_path("").to_string(); + let path = + partition_path(stream_name, lower_bound, upper_bound).join(&mainfest_file_name); + storage + .put_object(&path, serde_json::to_vec(&manifest)?.into()) + .await?; + let path = storage.absolute_url(&path); + let new_snapshot_entriy = snapshot::ManifestItem { + manifest_path: path.to_string(), + time_lower_bound: lower_bound, + time_upper_bound: upper_bound, + }; + manifests.push(new_snapshot_entriy); + storage.put_snapshot(stream_name, meta.snapshot).await?; + } } else { let lower_bound = lower_bound.date_naive().and_time(NaiveTime::MIN).and_utc(); let upper_bound = lower_bound @@ -137,7 +188,8 @@ pub async fn update_snapshot( ..Manifest::default() }; - let path = partition_path(stream_name, lower_bound, upper_bound).join("manifest.json"); + let mainfest_file_name = manifest_path("").to_string(); + let path = partition_path(stream_name, lower_bound, upper_bound).join(&mainfest_file_name); storage .put_object(&path, serde_json::to_vec(&manifest).unwrap().into()) .await?; @@ -148,12 +200,111 @@ pub async fn update_snapshot( time_upper_bound: upper_bound, }; manifests.push(new_snapshot_entriy); - storage.put_snapshot(stream_name, meta).await?; + storage.put_snapshot(stream_name, meta.snapshot).await?; } Ok(()) } +pub async fn remove_manifest_from_snapshot( + storage: Arc, + stream_name: &str, + dates: Vec, +) -> Result, ObjectStorageError> { + match CONFIG.parseable.mode { + Mode::All | Mode::Ingest => { + if !dates.is_empty() { + // get current snapshot + let mut meta = storage.get_object_store_format(stream_name).await?; + let manifests = &mut meta.snapshot.manifest_list; + // Filter out items whose manifest_path contains any of the dates_to_delete + manifests + .retain(|item| !dates.iter().any(|date| item.manifest_path.contains(date))); + storage.put_snapshot(stream_name, meta.snapshot).await?; + } + + let first_event_at = get_first_event(storage.clone(), stream_name, Vec::new()).await?; + + Ok(first_event_at) + } + Mode::Query => Ok(get_first_event(storage, stream_name, dates).await?), + } +} + +pub async fn get_first_event( + storage: Arc, + stream_name: &str, + 
dates: Vec, +) -> Result, ObjectStorageError> { + let mut first_event_at: String = String::default(); + match CONFIG.parseable.mode { + Mode::All | Mode::Ingest => { + // get current snapshot + let mut meta = storage.get_object_store_format(stream_name).await?; + let manifests = &mut meta.snapshot.manifest_list; + if manifests.is_empty() { + log::info!("No manifest found for stream {stream_name}"); + return Err(ObjectStorageError::Custom("No manifest found".to_string())); + } + let manifest = &manifests[0]; + let path = partition_path( + stream_name, + manifest.time_lower_bound, + manifest.time_upper_bound, + ); + let Some(manifest) = storage.get_manifest(&path).await? else { + return Err(ObjectStorageError::UnhandledError( + "Manifest found in snapshot but not in object-storage" + .to_string() + .into(), + )); + }; + if let Some(first_event) = manifest.files.first() { + let (lower_bound, _) = get_file_bounds(first_event); + first_event_at = lower_bound.with_timezone(&Local).to_rfc3339(); + } + } + Mode::Query => { + let ingestor_metadata = + handlers::http::cluster::get_ingestor_info() + .await + .map_err(|err| { + log::error!("Fatal: failed to get ingestor info: {:?}", err); + ObjectStorageError::from(err) + })?; + let mut ingestors_first_event_at: Vec = Vec::new(); + for ingestor in ingestor_metadata { + let url = format!( + "{}{}/logstream/{}/retention/cleanup", + ingestor.domain_name, + base_path_without_preceding_slash(), + stream_name + ); + // Convert dates vector to Bytes object + let dates_bytes = Bytes::from(serde_json::to_vec(&dates).unwrap()); + // delete the stream + + let ingestor_first_event_at = + handlers::http::cluster::send_retention_cleanup_request( + &url, + ingestor.clone(), + dates_bytes, + ) + .await?; + if !ingestor_first_event_at.is_empty() { + ingestors_first_event_at.push(ingestor_first_event_at); + } + } + if ingestors_first_event_at.is_empty() { + return Ok(None); + } + first_event_at = ingestors_first_event_at.iter().min().unwrap().to_string(); + } + } + + Ok(Some(first_event_at)) +} + /// Partition the path to which this manifest belongs. /// Useful when uploading the manifest file. fn partition_path( diff --git a/server/src/catalog/manifest.rs b/server/src/catalog/manifest.rs index bafed3dd5..ad5b32422 100644 --- a/server/src/catalog/manifest.rs +++ b/server/src/catalog/manifest.rs @@ -112,7 +112,6 @@ pub fn create_from_parquet_file( let columns = column_statistics(row_groups); manifest_file.columns = columns.into_values().collect(); let mut sort_orders = sort_order(row_groups); - if let Some(last_sort_order) = sort_orders.pop() { if sort_orders .into_iter() @@ -155,7 +154,7 @@ fn sort_order( }) .collect_vec(); - sort_orders.push(sort_order) + sort_orders.push(sort_order); } sort_orders } diff --git a/server/src/cli.rs b/server/src/cli.rs new file mode 100644 index 000000000..2ad9899cd --- /dev/null +++ b/server/src/cli.rs @@ -0,0 +1,469 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +use clap::{value_parser, Arg, ArgGroup, Command, FromArgMatches}; +use std::path::PathBuf; + +use url::Url; + +use crate::{ + oidc::{self, OpenidConfig}, + option::{validation, Compression, Mode}, +}; + +#[derive(Debug, Default)] +pub struct Cli { + /// The location of TLS Cert file + pub tls_cert_path: Option, + + /// The location of TLS Private Key file + pub tls_key_path: Option, + + /// The address on which the http server will listen. + pub address: String, + + /// Base domain under which server is hosted. + /// This information is used by OIDC to refer redirects + pub domain_address: Option, + + /// The local staging path is used as a temporary landing point + /// for incoming events and local cache + pub local_staging_path: PathBuf, + + /// The local cache path is used for speeding up query on latest data + pub local_cache_path: Option, + + /// Size for local cache + pub local_cache_size: u64, + + /// Username for the basic authentication on the server + pub username: String, + + /// Password for the basic authentication on the server + pub password: String, + + /// OpenId configuration + pub openid: Option, + + /// Server should check for update or not + pub check_update: bool, + + /// Server should send anonymous analytics or not + pub send_analytics: bool, + + /// Open AI access key + pub open_ai_key: Option, + + /// Livetail port + pub grpc_port: u16, + + /// Livetail channel capacity + pub livetail_channel_capacity: usize, + + /// Rows in Parquet Rowgroup + pub row_group_size: usize, + + /// Query memory limit in bytes + pub query_memory_pool_size: Option, + + /// Parquet compression algorithm + pub parquet_compression: Compression, + + /// Mode of operation + pub mode: Mode, + + /// public address for the parseable server ingestor + pub ingestor_endpoint: String, +} + +impl Cli { + // identifiers for arguments + pub const TLS_CERT: &'static str = "tls-cert-path"; + pub const TLS_KEY: &'static str = "tls-key-path"; + pub const ADDRESS: &'static str = "address"; + pub const DOMAIN_URI: &'static str = "origin"; + pub const STAGING: &'static str = "local-staging-path"; + pub const CACHE: &'static str = "cache-path"; + pub const CACHE_SIZE: &'static str = "cache-size"; + pub const USERNAME: &'static str = "username"; + pub const PASSWORD: &'static str = "password"; + pub const CHECK_UPDATE: &'static str = "check-update"; + pub const SEND_ANALYTICS: &'static str = "send-analytics"; + pub const OPEN_AI_KEY: &'static str = "open-ai-key"; + pub const OPENID_CLIENT_ID: &'static str = "oidc-client"; + pub const OPENID_CLIENT_SECRET: &'static str = "oidc-client-secret"; + pub const OPENID_ISSUER: &'static str = "oidc-issuer"; + pub const GRPC_PORT: &'static str = "grpc-port"; + pub const LIVETAIL_CAPACITY: &'static str = "livetail-capacity"; + // todo : what should this flag be + pub const QUERY_MEM_POOL_SIZE: &'static str = "query-mempool-size"; + pub const ROW_GROUP_SIZE: &'static str = "row-group-size"; + pub const PARQUET_COMPRESSION_ALGO: &'static str = "compression-algo"; + pub const MODE: &'static str = "mode"; + pub const INGESTOR_ENDPOINT: &'static str = "ingestor-endpoint"; + pub const DEFAULT_USERNAME: &'static str = "admin"; + pub const DEFAULT_PASSWORD: &'static str = "admin"; + + pub fn local_stream_data_path(&self, stream_name: &str) -> PathBuf { + self.local_staging_path.join(stream_name) + } + + pub fn get_scheme(&self) -> String { + if 
self.tls_cert_path.is_some() && self.tls_key_path.is_some() { + return "https".to_string(); + } + "http".to_string() + } + + pub fn create_cli_command_with_clap(name: &'static str) -> Command { + Command::new(name).next_line_help(false) + .arg( + Arg::new(Self::TLS_CERT) + .long(Self::TLS_CERT) + .env("P_TLS_CERT_PATH") + .value_name("PATH") + .value_parser(validation::file_path) + .help("Local path on this device where certificate file is located. Required to enable TLS"), + ) + .arg( + Arg::new(Self::TLS_KEY) + .long(Self::TLS_KEY) + .env("P_TLS_KEY_PATH") + .value_name("PATH") + .value_parser(validation::file_path) + .help("Local path on this device where private key file is located. Required to enable TLS"), + ) + .arg( + Arg::new(Self::ADDRESS) + .long(Self::ADDRESS) + .env("P_ADDR") + .value_name("ADDR:PORT") + .default_value("0.0.0.0:8000") + .value_parser(validation::socket_addr) + .help("Address and port for Parseable HTTP(s) server"), + ) + .arg( + Arg::new(Self::STAGING) + .long(Self::STAGING) + .env("P_STAGING_DIR") + .value_name("DIR") + .default_value("./staging") + .value_parser(validation::canonicalize_path) + .help("Local path on this device to be used as landing point for incoming events") + .next_line_help(true), + ) + .arg( + Arg::new(Self::CACHE) + .long(Self::CACHE) + .env("P_CACHE_DIR") + .value_name("DIR") + .value_parser(validation::canonicalize_path) + .help("Local path on this device to be used for caching data") + .next_line_help(true), + ) + .arg( + Arg::new(Self::CACHE_SIZE) + .long(Self::CACHE_SIZE) + .env("P_CACHE_SIZE") + .value_name("size") + .default_value("1GiB") + .value_parser(validation::cache_size) + .help("Maximum allowed cache size for all streams combined (In human readable format, e.g 1GiB, 2GiB, 100MB)") + .next_line_help(true), + ) + + .arg( + Arg::new(Self::USERNAME) + .long(Self::USERNAME) + .env("P_USERNAME") + .value_name("STRING") + .required(true) + .help("Admin username to be set for this Parseable server"), + ) + .arg( + Arg::new(Self::PASSWORD) + .long(Self::PASSWORD) + .env("P_PASSWORD") + .value_name("STRING") + .required(true) + .help("Admin password to be set for this Parseable server"), + ) + .arg( + Arg::new(Self::CHECK_UPDATE) + .long(Self::CHECK_UPDATE) + .env("P_CHECK_UPDATE") + .value_name("BOOL") + .required(false) + .default_value("true") + .value_parser(value_parser!(bool)) + .help("Enable/Disable checking for new Parseable release"), + ) + .arg( + Arg::new(Self::SEND_ANALYTICS) + .long(Self::SEND_ANALYTICS) + .env("P_SEND_ANONYMOUS_USAGE_DATA") + .value_name("BOOL") + .required(false) + .default_value("true") + .value_parser(value_parser!(bool)) + .help("Enable/Disable anonymous telemetry data collection"), + ) + .arg( + Arg::new(Self::OPEN_AI_KEY) + .long(Self::OPEN_AI_KEY) + .env("P_OPENAI_API_KEY") + .value_name("STRING") + .required(false) + .help("OpenAI key to enable llm features"), + ) + .arg( + Arg::new(Self::OPENID_CLIENT_ID) + .long(Self::OPENID_CLIENT_ID) + .env("P_OIDC_CLIENT_ID") + .value_name("STRING") + .required(false) + .help("Client id for OIDC provider"), + ) + .arg( + Arg::new(Self::OPENID_CLIENT_SECRET) + .long(Self::OPENID_CLIENT_SECRET) + .env("P_OIDC_CLIENT_SECRET") + .value_name("STRING") + .required(false) + .help("Client secret for OIDC provider"), + ) + .arg( + Arg::new(Self::OPENID_ISSUER) + .long(Self::OPENID_ISSUER) + .env("P_OIDC_ISSUER") + .value_name("URL") + .required(false) + .value_parser(validation::url) + .help("OIDC provider's host address"), + ) + .arg( + Arg::new(Self::DOMAIN_URI) 
+ .long(Self::DOMAIN_URI) + .env("P_ORIGIN_URI") + .value_name("URL") + .required(false) + .value_parser(validation::url) + .help("Parseable server global domain address"), + ) + .arg( + Arg::new(Self::GRPC_PORT) + .long(Self::GRPC_PORT) + .env("P_GRPC_PORT") + .value_name("PORT") + .default_value("8001") + .required(false) + .value_parser(value_parser!(u16)) + .help("Port for gRPC server"), + ) + .arg( + Arg::new(Self::LIVETAIL_CAPACITY) + .long(Self::LIVETAIL_CAPACITY) + .env("P_LIVETAIL_CAPACITY") + .value_name("NUMBER") + .default_value("1000") + .required(false) + .value_parser(value_parser!(usize)) + .help("Number of rows in livetail channel"), + ) + .arg( + Arg::new(Self::QUERY_MEM_POOL_SIZE) + .long(Self::QUERY_MEM_POOL_SIZE) + .env("P_QUERY_MEMORY_LIMIT") + .value_name("Gib") + .required(false) + .value_parser(value_parser!(u8)) + .help("Set a fixed memory limit for query"), + ) + .arg( + Arg::new(Self::ROW_GROUP_SIZE) + .long(Self::ROW_GROUP_SIZE) + .env("P_PARQUET_ROW_GROUP_SIZE") + .value_name("NUMBER") + .required(false) + .default_value("16384") + .value_parser(value_parser!(usize)) + .help("Number of rows in a row group"), + ).arg( + Arg::new(Self::MODE) + .long(Self::MODE) + .env("P_MODE") + .value_name("STRING") + .required(false) + .default_value("all") + .value_parser([ + "query", + "ingest", + "all"]) + .help("Mode of operation"), + ) + .arg( + Arg::new(Self::INGESTOR_ENDPOINT) + .long(Self::INGESTOR_ENDPOINT) + .env("P_INGESTOR_ENDPOINT") + .value_name("URL") + .required(false) + .help("URL to connect to this specific ingestor. Default is the address of the server.") + ) + .arg( + Arg::new(Self::PARQUET_COMPRESSION_ALGO) + .long(Self::PARQUET_COMPRESSION_ALGO) + .env("P_PARQUET_COMPRESSION_ALGO") + .value_name("[UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD]") + .required(false) + .default_value("lz4") + .value_parser([ + "uncompressed", + "snappy", + "gzip", + "lzo", + "brotli", + "lz4", + "zstd"]) + .help("Parquet compression algorithm"), + ).group( + ArgGroup::new("oidc") + .args([Self::OPENID_CLIENT_ID, Self::OPENID_CLIENT_SECRET, Self::OPENID_ISSUER]) + .requires_all([Self::OPENID_CLIENT_ID, Self::OPENID_CLIENT_SECRET, Self::OPENID_ISSUER]) + .multiple(true) + ) + } +} + +impl FromArgMatches for Cli { + fn from_arg_matches(m: &clap::ArgMatches) -> Result { + let mut s: Self = Self::default(); + s.update_from_arg_matches(m)?; + Ok(s) + } + + fn update_from_arg_matches(&mut self, m: &clap::ArgMatches) -> Result<(), clap::Error> { + self.local_cache_path = m.get_one::(Self::CACHE).cloned(); + self.tls_cert_path = m.get_one::(Self::TLS_CERT).cloned(); + self.tls_key_path = m.get_one::(Self::TLS_KEY).cloned(); + self.domain_address = m.get_one::(Self::DOMAIN_URI).cloned(); + + self.address = m + .get_one::(Self::ADDRESS) + .cloned() + .expect("default value for address"); + + self.ingestor_endpoint = m + .get_one::(Self::INGESTOR_ENDPOINT) + .cloned() + .unwrap_or_else(String::default); + + self.local_staging_path = m + .get_one::(Self::STAGING) + .cloned() + .expect("default value for staging"); + self.local_cache_size = m + .get_one::(Self::CACHE_SIZE) + .cloned() + .expect("default value for cache size"); + self.username = m + .get_one::(Self::USERNAME) + .cloned() + .expect("default for username"); + self.password = m + .get_one::(Self::PASSWORD) + .cloned() + .expect("default for password"); + self.check_update = m + .get_one::(Self::CHECK_UPDATE) + .cloned() + .expect("default for check update"); + self.send_analytics = m + .get_one::(Self::SEND_ANALYTICS) + 
.cloned() + .expect("default for send analytics"); + self.open_ai_key = m.get_one::(Self::OPEN_AI_KEY).cloned(); + self.grpc_port = m + .get_one::(Self::GRPC_PORT) + .cloned() + .expect("default for livetail port"); + self.livetail_channel_capacity = m + .get_one::(Self::LIVETAIL_CAPACITY) + .cloned() + .expect("default for livetail capacity"); + // converts Gib to bytes before assigning + self.query_memory_pool_size = m + .get_one::(Self::QUERY_MEM_POOL_SIZE) + .cloned() + .map(|gib| gib as usize * 1024usize.pow(3)); + self.row_group_size = m + .get_one::(Self::ROW_GROUP_SIZE) + .cloned() + .expect("default for row_group size"); + self.parquet_compression = match m + .get_one::(Self::PARQUET_COMPRESSION_ALGO) + .expect("default for compression algo") + .as_str() + { + "uncompressed" => Compression::UNCOMPRESSED, + "snappy" => Compression::SNAPPY, + "gzip" => Compression::GZIP, + "lzo" => Compression::LZO, + "brotli" => Compression::BROTLI, + "lz4" => Compression::LZ4, + "zstd" => Compression::ZSTD, + _ => unreachable!(), + }; + + let openid_client_id = m.get_one::(Self::OPENID_CLIENT_ID).cloned(); + let openid_client_secret = m.get_one::(Self::OPENID_CLIENT_SECRET).cloned(); + let openid_issuer = m.get_one::(Self::OPENID_ISSUER).cloned(); + + self.openid = match (openid_client_id, openid_client_secret, openid_issuer) { + (Some(id), Some(secret), Some(issuer)) => { + let origin = if let Some(url) = self.domain_address.clone() { + oidc::Origin::Production(url) + } else { + oidc::Origin::Local { + socket_addr: self.address.clone(), + https: self.tls_cert_path.is_some() && self.tls_key_path.is_some(), + } + }; + Some(OpenidConfig { + id, + secret, + issuer, + origin, + }) + } + _ => None, + }; + + self.mode = match m + .get_one::(Self::MODE) + .expect("Mode not set") + .as_str() + { + "query" => Mode::Query, + "ingest" => Mode::Ingest, + "all" => Mode::All, + _ => unreachable!(), + }; + + Ok(()) + } +} diff --git a/server/src/event.rs b/server/src/event.rs index 62db832bf..98774daf2 100644 --- a/server/src/event.rs +++ b/server/src/event.rs @@ -26,10 +26,9 @@ use itertools::Itertools; use std::sync::Arc; -use crate::metadata; - use self::error::EventError; pub use self::writer::STREAM_WRITERS; +use crate::metadata; pub const DEFAULT_TIMESTAMP_KEY: &str = "p_timestamp"; pub const DEFAULT_TAGS_KEY: &str = "p_tags"; diff --git a/server/src/event/format.rs b/server/src/event/format.rs index 55f8e106a..169f35e23 100644 --- a/server/src/event/format.rs +++ b/server/src/event/format.rs @@ -41,26 +41,34 @@ pub trait EventFormat: Sized { fn to_data( self, schema: HashMap>, + time_partition: Option, + static_schema_flag: Option, ) -> Result<(Self::Data, EventSchema, bool, Tags, Metadata), AnyError>; fn decode(data: Self::Data, schema: Arc) -> Result; fn into_recordbatch( self, - schema: HashMap>, + storage_schema: HashMap>, + time_partition: Option, + static_schema_flag: Option, ) -> Result<(RecordBatch, bool), AnyError> { - let (data, mut schema, is_first, tags, metadata) = self.to_data(schema)?; + let (data, mut schema, is_first, tags, metadata) = self.to_data( + storage_schema.clone(), + time_partition, + static_schema_flag.clone(), + )?; if get_field(&schema, DEFAULT_TAGS_KEY).is_some() { return Err(anyhow!("field {} is a reserved field", DEFAULT_TAGS_KEY)); }; - if get_field(&schema, DEFAULT_TAGS_KEY).is_some() { + if get_field(&schema, DEFAULT_METADATA_KEY).is_some() { return Err(anyhow!( "field {} is a reserved field", DEFAULT_METADATA_KEY )); }; - if get_field(&schema, 
DEFAULT_TAGS_KEY).is_some() { + if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { return Err(anyhow!( "field {} is a reserved field", DEFAULT_TIMESTAMP_KEY @@ -88,14 +96,17 @@ pub trait EventFormat: Sized { ))); // prepare the record batch and new fields to be added - let schema = Arc::new(Schema::new(schema)); - let rb = Self::decode(data, schema.clone())?; + let new_schema = Arc::new(Schema::new(schema)); + if !Self::is_schema_matching(new_schema.clone(), storage_schema, static_schema_flag) { + return Err(anyhow!("Schema mismatch")); + } + let rb = Self::decode(data, new_schema.clone())?; let tags_arr = StringArray::from_iter_values(std::iter::repeat(&tags).take(rb.num_rows())); let metadata_arr = StringArray::from_iter_values(std::iter::repeat(&metadata).take(rb.num_rows())); // modify the record batch to add fields to respective indexes let rb = utils::arrow::replace_columns( - Arc::clone(&schema), + Arc::clone(&new_schema), &rb, &[tags_index, metadata_index], &[Arc::new(tags_arr), Arc::new(metadata_arr)], @@ -103,4 +114,32 @@ pub trait EventFormat: Sized { Ok((rb, is_first)) } + + fn is_schema_matching( + new_schema: Arc, + storage_schema: HashMap>, + static_schema_flag: Option, + ) -> bool { + if static_schema_flag.is_none() { + return true; + } + for (field_name, field) in new_schema + .fields() + .iter() + .map(|field| (field.name().to_owned(), field.clone())) + .collect::>>() + { + if let Some(storage_field) = storage_schema.get(&field_name) { + if field_name != *storage_field.name() { + return false; + } + if field.data_type() != storage_field.data_type() { + return false; + } + } else { + return false; + } + } + true + } } diff --git a/server/src/event/format/json.rs b/server/src/event/format/json.rs index edeb395b3..b31acbedc 100644 --- a/server/src/event/format/json.rs +++ b/server/src/event/format/json.rs @@ -45,8 +45,10 @@ impl EventFormat for Event { fn to_data( self, schema: HashMap>, + time_partition: Option, + static_schema_flag: Option, ) -> Result<(Self::Data, Vec>, bool, Tags, Metadata), anyhow::Error> { - let data = flatten_json_body(self.data)?; + let data = flatten_json_body(self.data, time_partition)?; let stream_schema = schema; // incoming event may be a single json or a json array @@ -90,9 +92,10 @@ impl EventFormat for Event { }, }; - if value_arr - .iter() - .any(|value| fields_mismatch(&schema, value)) + if static_schema_flag.is_none() + && value_arr + .iter() + .any(|value| fields_mismatch(&schema, value)) { return Err(anyhow!( "Could not process this event due to mismatch in datatype" diff --git a/server/src/event/writer/file_writer.rs b/server/src/event/writer/file_writer.rs index 9ff62c5c3..d90e361e3 100644 --- a/server/src/event/writer/file_writer.rs +++ b/server/src/event/writer/file_writer.rs @@ -17,13 +17,12 @@ * */ -use std::collections::HashMap; -use std::fs::{File, OpenOptions}; -use std::path::PathBuf; - use arrow_array::RecordBatch; use arrow_ipc::writer::StreamWriter; use derive_more::{Deref, DerefMut}; +use std::collections::HashMap; +use std::fs::{File, OpenOptions}; +use std::path::PathBuf; use crate::storage::staging::StorageDir; @@ -83,7 +82,6 @@ fn init_new_stream_writer_file( ) -> Result<(PathBuf, StreamWriter), StreamWriterError> { let dir = StorageDir::new(stream_name); let path = dir.path_by_current_time(schema_key); - std::fs::create_dir_all(dir.data_path)?; let file = OpenOptions::new().create(true).append(true).open(&path)?; @@ -94,6 +92,5 @@ fn init_new_stream_writer_file( stream_writer .write(record) 
.map_err(StreamWriterError::Writer)?; - Ok((path, stream_writer)) } diff --git a/server/src/handlers.rs b/server/src/handlers.rs index 81beea0bd..57e4aebcb 100644 --- a/server/src/handlers.rs +++ b/server/src/handlers.rs @@ -23,7 +23,8 @@ const PREFIX_TAGS: &str = "x-p-tag-"; const PREFIX_META: &str = "x-p-meta-"; const STREAM_NAME_HEADER_KEY: &str = "x-p-stream"; const LOG_SOURCE_KEY: &str = "x-p-log-source"; - +const TIME_PARTITION_KEY: &str = "x-p-time-partition"; +const STATIC_SCHEMA_FLAG: &str = "x-p-static-schema-flag"; const AUTHORIZATION_KEY: &str = "authorization"; const SEPARATOR: char = '^'; diff --git a/server/src/handlers/http.rs b/server/src/handlers/http.rs index 3fad5c2cc..6044b74ac 100644 --- a/server/src/handlers/http.rs +++ b/server/src/handlers/http.rs @@ -16,346 +16,116 @@ * */ -use std::fs::File; -use std::io::BufReader; -use std::sync::Arc; - use actix_cors::Cors; -use actix_web::{ - web::{self, resource}, - App, HttpServer, -}; -use actix_web_prometheus::PrometheusMetrics; -use actix_web_static_files::ResourceFiles; -use log::info; -use openid::Discovered; -use rustls::{Certificate, PrivateKey, ServerConfig}; -use rustls_pemfile::{certs, pkcs8_private_keys}; +use arrow_schema::Schema; +use itertools::Itertools; +use serde_json::Value; use crate::option::CONFIG; -use crate::rbac::role::Action; -use self::middleware::{DisAllowRootUser, RouteExt}; +use self::{cluster::get_ingestor_info, query::Query}; -mod about; -mod health_check; -mod ingest; +pub(crate) mod about; +pub mod cluster; +pub(crate) mod health_check; +pub(crate) mod ingest; mod kinesis; -mod llm; -mod logstream; -mod middleware; -mod oidc; -mod query; -mod rbac; -mod role; - -include!(concat!(env!("OUT_DIR"), "/generated.rs")); - -const MAX_EVENT_PAYLOAD_SIZE: usize = 10485760; -const API_BASE_PATH: &str = "/api"; -const API_VERSION: &str = "v1"; - -pub async fn run_http( - prometheus: PrometheusMetrics, - oidc_client: Option, -) -> anyhow::Result<()> { - let oidc_client = match oidc_client { - Some(config) => { - let client = config - .connect(&format!("{API_BASE_PATH}/{API_VERSION}/o/code")) - .await?; - Some(Arc::new(client)) - } - None => None, - }; - - let create_app = move || { - App::new() - .wrap(prometheus.clone()) - .configure(|cfg| configure_routes(cfg, oidc_client.clone())) - .wrap(actix_web::middleware::Logger::default()) - .wrap(actix_web::middleware::Compress::default()) - .wrap(cross_origin_config()) - }; - - let ssl_acceptor = match ( - &CONFIG.parseable.tls_cert_path, - &CONFIG.parseable.tls_key_path, - ) { - (Some(cert), Some(key)) => { - // init server config builder with safe defaults - let config = ServerConfig::builder() - .with_safe_defaults() - .with_no_client_auth(); - - // load TLS key/cert files - let cert_file = &mut BufReader::new(File::open(cert)?); - let key_file = &mut BufReader::new(File::open(key)?); - - // convert files to key/cert objects - let cert_chain = certs(cert_file)?.into_iter().map(Certificate).collect(); - - let mut keys: Vec = pkcs8_private_keys(key_file)? 
- .into_iter() - .map(PrivateKey) - .collect(); - - // exit if no keys could be parsed - if keys.is_empty() { - anyhow::bail!("Could not locate PKCS 8 private keys."); - } - - let server_config = config.with_single_cert(cert_chain, keys.remove(0))?; - - Some(server_config) - } - (_, _) => None, - }; - - // concurrent workers equal to number of cores on the cpu - let http_server = HttpServer::new(create_app).workers(num_cpus::get()); - if let Some(config) = ssl_acceptor { - http_server - .bind_rustls(&CONFIG.parseable.address, config)? - .run() - .await?; - } else { - http_server.bind(&CONFIG.parseable.address)?.run().await?; - } - - Ok(()) +pub(crate) mod llm; +pub(crate) mod logstream; +pub(crate) mod middleware; +pub mod modal; +pub(crate) mod oidc; +mod otel; +pub(crate) mod query; +pub(crate) mod rbac; +pub(crate) mod role; + +pub const MAX_EVENT_PAYLOAD_SIZE: usize = 10485760; +pub const API_BASE_PATH: &str = "api"; +pub const API_VERSION: &str = "v1"; + +pub(crate) fn base_path() -> String { + format!("/{API_BASE_PATH}/{API_VERSION}") } -pub fn configure_routes( - cfg: &mut web::ServiceConfig, - oidc_client: Option>>, -) { - let generated = generate(); - - //log stream API - let logstream_api = web::scope("/{logstream}") - .service( - web::resource("") - // PUT "/logstream/{logstream}" ==> Create log stream - .route( - web::put() - .to(logstream::put_stream) - .authorize_for_stream(Action::CreateStream), - ) - // POST "/logstream/{logstream}" ==> Post logs to given log stream - .route( - web::post() - .to(ingest::post_event) - .authorize_for_stream(Action::Ingest), - ) - // DELETE "/logstream/{logstream}" ==> Delete log stream - .route( - web::delete() - .to(logstream::delete) - .authorize_for_stream(Action::DeleteStream), - ) - .app_data(web::PayloadConfig::default().limit(MAX_EVENT_PAYLOAD_SIZE)), - ) - .service( - web::resource("/alert") - // PUT "/logstream/{logstream}/alert" ==> Set alert for given log stream - .route( - web::put() - .to(logstream::put_alert) - .authorize_for_stream(Action::PutAlert), - ) - // GET "/logstream/{logstream}/alert" ==> Get alert for given log stream - .route( - web::get() - .to(logstream::get_alert) - .authorize_for_stream(Action::GetAlert), - ), - ) - .service( - // GET "/logstream/{logstream}/schema" ==> Get schema for given log stream - web::resource("/schema").route( - web::get() - .to(logstream::schema) - .authorize_for_stream(Action::GetSchema), - ), - ) - .service( - // GET "/logstream/{logstream}/stats" ==> Get stats for given log stream - web::resource("/stats").route( - web::get() - .to(logstream::get_stats) - .authorize_for_stream(Action::GetStats), - ), - ) - .service( - web::resource("/retention") - // PUT "/logstream/{logstream}/retention" ==> Set retention for given logstream - .route( - web::put() - .to(logstream::put_retention) - .authorize_for_stream(Action::PutRetention), - ) - // GET "/logstream/{logstream}/retention" ==> Get retention for given logstream - .route( - web::get() - .to(logstream::get_retention) - .authorize_for_stream(Action::GetRetention), - ), - ) - .service( - web::resource("/cache") - // PUT "/logstream/{logstream}/cache" ==> Set retention for given logstream - .route( - web::put() - .to(logstream::put_enable_cache) - .authorize_for_stream(Action::PutCacheEnabled), - ) - // GET "/logstream/{logstream}/cache" ==> Get retention for given logstream - .route( - web::get() - .to(logstream::get_cache_enabled) - .authorize_for_stream(Action::GetCacheEnabled), - ), - ); - - // User API - let user_api = 
web::scope("/user") - .service( - web::resource("") - // GET /user => List all users - .route(web::get().to(rbac::list_users).authorize(Action::ListUser)), - ) - .service( - web::resource("/{username}") - // PUT /user/{username} => Create a new user - .route(web::post().to(rbac::post_user).authorize(Action::PutUser)) - // DELETE /user/{username} => Delete a user - .route( - web::delete() - .to(rbac::delete_user) - .authorize(Action::DeleteUser), - ) - .wrap(DisAllowRootUser), - ) - .service( - web::resource("/{username}/role") - // PUT /user/{username}/roles => Put roles for user - .route( - web::put() - .to(rbac::put_role) - .authorize(Action::PutUserRoles) - .wrap(DisAllowRootUser), - ) - .route( - web::get() - .to(rbac::get_role) - .authorize_for_user(Action::GetUserRoles), - ), - ) - .service( - web::resource("/{username}/generate-new-password") - // POST /user/{username}/generate-new-password => reset password for this user - .route( - web::post() - .to(rbac::post_gen_password) - .authorize(Action::PutUser) - .wrap(DisAllowRootUser), - ), - ); - - let llm_query_api = web::scope("/llm").service( - web::resource("").route( - web::post() - .to(llm::make_llm_request) - .authorize(Action::QueryLLM), - ), - ); - - let role_api = web::scope("/role") - .service(resource("").route(web::get().to(role::list).authorize(Action::ListRole))) - .service( - resource("/default") - .route(web::put().to(role::put_default).authorize(Action::PutRole)) - .route(web::get().to(role::get_default).authorize(Action::GetRole)), - ) - .service( - resource("/{name}") - .route(web::put().to(role::put).authorize(Action::PutRole)) - .route(web::delete().to(role::delete).authorize(Action::DeleteRole)) - .route(web::get().to(role::get).authorize(Action::GetRole)), - ); - - let mut oauth_api = web::scope("/o") - .service(resource("/login").route(web::get().to(oidc::login))) - .service(resource("/logout").route(web::get().to(oidc::logout))) - .service(resource("/code").route(web::get().to(oidc::reply_login))); +pub fn metrics_path() -> String { + format!("{}/metrics", base_path()) +} - if let Some(client) = oidc_client { - info!("Registered oidc client"); - oauth_api = oauth_api.app_data(web::Data::from(client)) +pub(crate) fn cross_origin_config() -> Cors { + if cfg!(feature = "debug") { + Cors::permissive().block_on_origin_mismatch(false) + } else { + Cors::default().block_on_origin_mismatch(false) } - - // Deny request if username is same as the env variable P_USERNAME. 
- cfg.service( - // Base path "{url}/api/v1" - web::scope(&base_path()) - // POST "/query" ==> Get results of the SQL query passed in request body - .service( - web::resource("/query") - .route(web::post().to(query::query).authorize(Action::Query)), - ) - // POST "/ingest" ==> Post logs to given log stream based on header - .service( - web::resource("/ingest") - .route( - web::post() - .to(ingest::ingest) - .authorize_for_stream(Action::Ingest), - ) - .app_data(web::PayloadConfig::default().limit(MAX_EVENT_PAYLOAD_SIZE)), - ) - // GET "/liveness" ==> Liveness check as per https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command - .service(web::resource("/liveness").route(web::get().to(health_check::liveness))) - // GET "/readiness" ==> Readiness check as per https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes - .service(web::resource("/readiness").route(web::get().to(health_check::readiness))) - // GET "/about" ==> Returns information about instance - .service( - web::resource("/about") - .route(web::get().to(about::about).authorize(Action::GetAbout)), - ) - .service( - web::scope("/logstream") - .service( - // GET "/logstream" ==> Get list of all Log Streams on the server - web::resource("") - .route(web::get().to(logstream::list).authorize(Action::ListStream)), - ) - .service( - // logstream API - logstream_api, - ), - ) - .service(user_api) - .service(llm_query_api) - .service(oauth_api) - .service(role_api), - ) - // GET "/" ==> Serve the static frontend directory - .service(ResourceFiles::new("/", generated).resolve_not_found_to_root()); } -fn base_path() -> String { +pub fn base_path_without_preceding_slash() -> String { format!("{API_BASE_PATH}/{API_VERSION}") } -pub fn metrics_path() -> String { - format!("{}/metrics", base_path()) +/// Fetches the schema for the specified stream. +/// +/// # Arguments +/// +/// * `stream_name` - The name of the stream to fetch the schema for. +/// +/// # Returns +/// +/// An `anyhow::Result` containing the `arrow_schema::Schema` for the specified stream. +pub async fn fetch_schema(stream_name: &str) -> anyhow::Result { + let path_prefix = + relative_path::RelativePathBuf::from(format!("{}/{}", stream_name, ".stream")); + let store = CONFIG.storage().get_object_store(); + let res: Vec = store + .get_objects( + Some(&path_prefix), + Box::new(|file_name: String| file_name.contains(".schema")), + ) + .await? 
+ .iter() + // we should be able to unwrap as we know the data is valid schema + .map(|byte_obj| serde_json::from_slice(byte_obj).expect("data is valid json")) + .collect_vec(); + + let new_schema = Schema::try_merge(res)?; + Ok(new_schema) } -fn cross_origin_config() -> Cors { - if cfg!(feature = "debug") { - Cors::permissive().block_on_origin_mismatch(false) - } else { - Cors::default().block_on_origin_mismatch(false) +/// unused for now, might need it later +#[allow(unused)] +pub async fn send_query_request_to_ingestor(query: &Query) -> anyhow::Result> { + // send the query request to the ingestor + let mut res = vec![]; + let ima = get_ingestor_info().await?; + + for im in ima.iter() { + let uri = format!( + "{}{}/{}", + im.domain_name, + base_path_without_preceding_slash(), + "query" + ); + let reqw = reqwest::Client::new() + .post(uri) + .json(query) + .header(http::header::AUTHORIZATION, im.token.clone()) + .header(http::header::CONTENT_TYPE, "application/json") + .send() + .await; + + if let Ok(reqw) = reqw { + // do i need to do a success check?? + let v: Value = serde_json::from_slice(&reqw.bytes().await?)?; + // the value returned is an array of json objects + // so it needs to be flattened + if let Some(arr) = v.as_array() { + for val in arr { + res.push(val.to_owned()) + } + } + } } + + Ok(res) } diff --git a/server/src/handlers/http/about.rs b/server/src/handlers/http/about.rs index 3f42ccc4f..1603139ff 100644 --- a/server/src/handlers/http/about.rs +++ b/server/src/handlers/http/about.rs @@ -20,9 +20,34 @@ use actix_web::web::Json; use human_size::SpecificSize; use serde_json::json; -use crate::{about, option::CONFIG, storage::StorageMetadata, utils::update}; +use crate::{ + about, + option::{Mode, CONFIG}, + storage::StorageMetadata, + utils::update, +}; use std::path::PathBuf; +/// { +/// "version": current_version, +/// "uiVersion": ui_version, +/// "commit": commit, +/// "deploymentId": deployment_id, +/// "updateAvailable": update_available, +/// "latestVersion": latest_release, +/// "llmActive": is_llm_active, +/// "llmProvider": llm_provider, +/// "oidcActive": is_oidc_active, +/// "license": "AGPL-3.0-only", +/// "mode": mode, +/// "staging": staging, +/// "cache": cache_details, +/// "grpcPort": grpc_port, +/// "store": { +/// "type": CONFIG.get_storage_mode_string(), +/// "path": store_endpoint +/// } +/// } pub async fn about() -> Json { let meta = StorageMetadata::global(); @@ -40,11 +65,15 @@ pub async fn about() -> Json { let current_version = format!("v{}", current_release.released_version); let commit = current_release.commit_hash; let deployment_id = meta.deployment_id.to_string(); - let mode = CONFIG.mode_string(); - let staging = CONFIG.staging_dir(); + let mode = CONFIG.get_server_mode_string(); + let staging = if CONFIG.parseable.mode == Mode::Query { + "".to_string() + } else { + CONFIG.staging_dir().display().to_string() + }; let grpc_port = CONFIG.parseable.grpc_port; - let store = CONFIG.storage().get_endpoint(); + let store_endpoint = CONFIG.storage().get_endpoint(); let is_llm_active = &CONFIG.parseable.open_ai_key.is_some(); let llm_provider = is_llm_active.then_some("OpenAI"); let is_oidc_active = CONFIG.parseable.openid.is_some(); @@ -80,6 +109,9 @@ pub async fn about() -> Json { "staging": staging, "cache": cache_details, "grpcPort": grpc_port, - "store": store + "store": { + "type": CONFIG.get_storage_mode_string(), + "path": store_endpoint + } })) } diff --git a/server/src/handlers/http/cluster/mod.rs b/server/src/handlers/http/cluster/mod.rs 
new file mode 100644 index 000000000..88c563251 --- /dev/null +++ b/server/src/handlers/http/cluster/mod.rs @@ -0,0 +1,498 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +pub mod utils; + +use crate::handlers::http::cluster::utils::{ + check_liveness, to_url_string, IngestionStats, QueriedStats, +}; +use crate::handlers::http::ingest::PostError; +use crate::handlers::http::logstream::error::StreamError; +use crate::handlers::{STATIC_SCHEMA_FLAG, TIME_PARTITION_KEY}; +use crate::option::CONFIG; + +use crate::metrics::prom_utils::Metrics; +use crate::storage::object_storage::ingestor_metadata_path; +use crate::storage::{ObjectStorageError, STREAM_ROOT_DIRECTORY}; +use crate::storage::{ObjectStoreFormat, PARSEABLE_ROOT_DIRECTORY}; +use actix_web::http::header; +use actix_web::{HttpRequest, Responder}; +use bytes::Bytes; +use chrono::Utc; +use http::StatusCode; +use itertools::Itertools; +use relative_path::RelativePathBuf; +use serde::de::Error; +use serde_json::error::Error as SerdeError; +use serde_json::Value as JsonValue; +use url::Url; +type IngestorMetadataArr = Vec; + +use self::utils::StorageStats; + +use super::base_path_without_preceding_slash; + +use super::modal::IngestorMetadata; + +pub async fn sync_cache_with_ingestors( + url: &str, + ingestor: IngestorMetadata, + body: bool, +) -> Result<(), StreamError> { + if !utils::check_liveness(&ingestor.domain_name).await { + return Ok(()); + } + let request_body: Bytes = Bytes::from(body.to_string()); + let client = reqwest::Client::new(); + let resp = client + .put(url) + .header(header::CONTENT_TYPE, "application/json") + .header(header::AUTHORIZATION, ingestor.token) + .body(request_body) + .send() + .await + .map_err(|err| { + // log the error and return a custom error + log::error!( + "Fatal: failed to set cache: {}\n Error: {:?}", + ingestor.domain_name, + err + ); + StreamError::Network(err) + })?; + + // if the response is not successful, log the error and return a custom error + // this could be a bit too much, but we need to be sure it covers all cases + if !resp.status().is_success() { + log::error!( + "failed to set cache: {}\nResponse Returned: {:?}", + ingestor.domain_name, + resp.text().await + ); + } + + Ok(()) +} + +// forward the request to all ingestors to keep them in sync +#[allow(dead_code)] +pub async fn sync_streams_with_ingestors( + stream_name: &str, + time_partition: &str, + static_schema: &str, + schema: Bytes, +) -> Result<(), StreamError> { + let ingestor_infos = get_ingestor_info().await.map_err(|err| { + log::error!("Fatal: failed to get ingestor info: {:?}", err); + StreamError::Anyhow(err) + })?; + + let mut errored = false; + for ingestor in ingestor_infos.iter() { + let url = format!( + "{}{}/logstream/{}", + ingestor.domain_name, + base_path_without_preceding_slash(), + stream_name + ); + + match send_stream_sync_request( + &url, + 
ingestor.clone(), + time_partition, + static_schema, + schema.clone(), + ) + .await + { + Ok(_) => continue, + Err(_) => { + errored = true; + break; + } + } + } + + if errored { + for ingestor in ingestor_infos { + let url = format!( + "{}{}/logstream/{}", + ingestor.domain_name, + base_path_without_preceding_slash(), + stream_name + ); + + // delete the stream + send_stream_delete_request(&url, ingestor.clone()).await?; + } + + // this might be a bit too much + return Err(StreamError::Custom { + msg: "Failed to sync stream with ingestors".to_string(), + status: StatusCode::INTERNAL_SERVER_ERROR, + }); + } + + Ok(()) +} + +/// get the cumulative stats from all ingestors +pub async fn fetch_stats_from_ingestors( + stream_name: &str, +) -> Result, StreamError> { + let path = RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY]); + let obs = CONFIG + .storage() + .get_object_store() + .get_objects( + Some(&path), + Box::new(|file_name| file_name.starts_with(".ingestor")), + ) + .await?; + let mut ingestion_size = 0u64; + let mut storage_size = 0u64; + let mut count = 0u64; + for ob in obs { + if let Ok(stat) = serde_json::from_slice::(&ob) { + count += stat.stats.events; + ingestion_size += stat.stats.ingestion; + storage_size += stat.stats.storage; + } + } + + let qs = QueriedStats::new( + "", + Utc::now(), + IngestionStats::new(count, format!("{} Bytes", ingestion_size), "json"), + StorageStats::new(format!("{} Bytes", storage_size), "parquet"), + ); + + Ok(vec![qs]) +} + +#[allow(dead_code)] +async fn send_stream_sync_request( + url: &str, + ingestor: IngestorMetadata, + time_partition: &str, + static_schema: &str, + schema: Bytes, +) -> Result<(), StreamError> { + if !utils::check_liveness(&ingestor.domain_name).await { + return Ok(()); + } + + let client = reqwest::Client::new(); + let res = client + .put(url) + .header(header::CONTENT_TYPE, "application/json") + .header(TIME_PARTITION_KEY, time_partition) + .header(STATIC_SCHEMA_FLAG, static_schema) + .header(header::AUTHORIZATION, ingestor.token) + .body(schema) + .send() + .await + .map_err(|err| { + log::error!( + "Fatal: failed to forward create stream request to ingestor: {}\n Error: {:?}", + ingestor.domain_name, + err + ); + StreamError::Network(err) + })?; + + if !res.status().is_success() { + log::error!( + "failed to forward create stream request to ingestor: {}\nResponse Returned: {:?}", + ingestor.domain_name, + res + ); + return Err(StreamError::Network(res.error_for_status().unwrap_err())); + } + + Ok(()) +} + +/// send a delete stream request to all ingestors +pub async fn send_stream_delete_request( + url: &str, + ingestor: IngestorMetadata, +) -> Result<(), StreamError> { + if !utils::check_liveness(&ingestor.domain_name).await { + return Ok(()); + } + let client = reqwest::Client::new(); + let resp = client + .delete(url) + .header(header::CONTENT_TYPE, "application/json") + .header(header::AUTHORIZATION, ingestor.token) + .send() + .await + .map_err(|err| { + // log the error and return a custom error + log::error!( + "Fatal: failed to delete stream: {}\n Error: {:?}", + ingestor.domain_name, + err + ); + StreamError::Network(err) + })?; + + // if the response is not successful, log the error and return a custom error + // this could be a bit too much, but we need to be sure it covers all cases + if !resp.status().is_success() { + log::error!( + "failed to delete stream: {}\nResponse Returned: {:?}", + ingestor.domain_name, + resp.text().await + ); + } + + Ok(()) +} + +/// send a retention cleanup request 
to all ingestors +pub async fn send_retention_cleanup_request( + url: &str, + ingestor: IngestorMetadata, + body: Bytes, +) -> Result { + let mut first_event_at: String = String::default(); + if !utils::check_liveness(&ingestor.domain_name).await { + return Ok(first_event_at); + } + let client = reqwest::Client::new(); + let resp = client + .post(url) + .header(header::CONTENT_TYPE, "application/json") + .header(header::AUTHORIZATION, ingestor.token) + .body(body) + .send() + .await + .map_err(|err| { + // log the error and return a custom error + log::error!( + "Fatal: failed to perform cleanup on retention: {}\n Error: {:?}", + ingestor.domain_name, + err + ); + ObjectStorageError::Custom(err.to_string()) + })?; + + // if the response is not successful, log the error and return a custom error + // this could be a bit too much, but we need to be sure it covers all cases + if !resp.status().is_success() { + log::error!( + "failed to perform cleanup on retention: {}\nResponse Returned: {:?}", + ingestor.domain_name, + resp.status() + ); + } + + let resp_data = resp.bytes().await.map_err(|err| { + log::error!("Fatal: failed to parse response to bytes: {:?}", err); + ObjectStorageError::Custom(err.to_string()) + })?; + + first_event_at = String::from_utf8_lossy(&resp_data).to_string(); + Ok(first_event_at) +} + +pub async fn get_cluster_info() -> Result { + let ingestor_infos = get_ingestor_info().await.map_err(|err| { + log::error!("Fatal: failed to get ingestor info: {:?}", err); + StreamError::Anyhow(err) + })?; + + let mut infos = vec![]; + + for ingestor in ingestor_infos { + let uri = Url::parse(&format!( + "{}{}/about", + ingestor.domain_name, + base_path_without_preceding_slash() + )) + .expect("should always be a valid url"); + + let resp = reqwest::Client::new() + .get(uri) + .header(header::AUTHORIZATION, ingestor.token.clone()) + .header(header::CONTENT_TYPE, "application/json") + .send() + .await; + + let (reachable, staging_path, error, status) = if let Ok(resp) = resp { + let status = Some(resp.status().to_string()); + + let resp_data = resp.bytes().await.map_err(|err| { + log::error!("Fatal: failed to parse ingestor info to bytes: {:?}", err); + StreamError::Network(err) + })?; + + let sp = serde_json::from_slice::(&resp_data) + .map_err(|err| { + log::error!("Fatal: failed to parse ingestor info: {:?}", err); + StreamError::SerdeError(err) + })? + .get("staging") + .ok_or(StreamError::SerdeError(SerdeError::missing_field( + "staging", + )))? + .as_str() + .ok_or(StreamError::SerdeError(SerdeError::custom( + "staging path not a string/ not provided", + )))? 
+ .to_string(); + + (true, sp, None, status) + } else { + ( + false, + "".to_owned(), + resp.as_ref().err().map(|e| e.to_string()), + resp.unwrap_err().status().map(|s| s.to_string()), + ) + }; + + infos.push(utils::ClusterInfo::new( + &ingestor.domain_name, + reachable, + staging_path, + CONFIG.storage().get_endpoint(), + error, + status, + )); + } + + Ok(actix_web::HttpResponse::Ok().json(infos)) +} + +pub async fn get_cluster_metrics() -> Result { + let ingestor_metadata = get_ingestor_info().await.map_err(|err| { + log::error!("Fatal: failed to get ingestor info: {:?}", err); + PostError::Invalid(err) + })?; + + let mut dresses = vec![]; + + for ingestor in ingestor_metadata { + let uri = Url::parse(&format!( + "{}{}/metrics", + &ingestor.domain_name, + base_path_without_preceding_slash() + )) + .map_err(|err| { + PostError::Invalid(anyhow::anyhow!("Invalid URL in Ingestor Metadata: {}", err)) + })?; + + let res = reqwest::Client::new() + .get(uri) + .header(header::CONTENT_TYPE, "application/json") + .send() + .await; + + if let Ok(res) = res { + let text = res.text().await.map_err(PostError::NetworkError)?; + let lines: Vec> = + text.lines().map(|line| Ok(line.to_owned())).collect_vec(); + + let sample = prometheus_parse::Scrape::parse(lines.into_iter()) + .map_err(|err| PostError::CustomError(err.to_string()))? + .samples; + + dresses.push(Metrics::from_prometheus_samples( + sample, + ingestor.domain_name, + )); + } else { + log::warn!( + "Failed to fetch metrics from ingestor: {}\n", + ingestor.domain_name, + ); + } + } + + Ok(actix_web::HttpResponse::Ok().json(dresses)) +} + +// update the .query.json file and return the new ingestorMetadataArr +pub async fn get_ingestor_info() -> anyhow::Result { + let store = CONFIG.storage().get_object_store(); + + let root_path = RelativePathBuf::from(PARSEABLE_ROOT_DIRECTORY); + let arr = store + .get_objects( + Some(&root_path), + Box::new(|file_name| file_name.starts_with("ingestor")), + ) + .await? 
+ .iter() + // this unwrap will most definateley shoot me in the foot later + .map(|x| serde_json::from_slice::(x).unwrap_or_default()) + .collect_vec(); + + Ok(arr) +} + +pub async fn remove_ingestor(req: HttpRequest) -> Result { + let domain_name: String = req.match_info().get("ingestor").unwrap().parse().unwrap(); + let domain_name = to_url_string(domain_name); + + if check_liveness(&domain_name).await { + return Err(PostError::Invalid(anyhow::anyhow!("Node Online"))); + } + let object_store = CONFIG.storage().get_object_store(); + + let ingestor_metadatas = object_store + .get_objects( + Some(&RelativePathBuf::from(PARSEABLE_ROOT_DIRECTORY)), + Box::new(|file_name| file_name.starts_with("ingestor")), + ) + .await?; + + let ingestor_metadata = ingestor_metadatas + .iter() + .map(|elem| serde_json::from_slice::(elem).unwrap_or_default()) + .collect_vec(); + + let ingestor_metadata = ingestor_metadata + .iter() + .filter(|elem| elem.domain_name == domain_name) + .collect_vec(); + + let ingestor_meta_filename = + ingestor_metadata_path(Some(&ingestor_metadata[0].ingestor_id)).to_string(); + let msg = match object_store + .try_delete_ingestor_meta(ingestor_meta_filename) + .await + { + Ok(_) => { + format!("Node {} Removed Successfully", domain_name) + } + Err(err) => { + if matches!(err, ObjectStorageError::IoError(_)) { + format!("Node {} Not Found", domain_name) + } else { + format!("Error Removing Node {}\n Reason: {}", domain_name, err) + } + } + }; + + log::info!("{}", &msg); + Ok((msg, StatusCode::OK)) +} diff --git a/server/src/handlers/http/cluster/utils.rs b/server/src/handlers/http/cluster/utils.rs new file mode 100644 index 000000000..579ffa99c --- /dev/null +++ b/server/src/handlers/http/cluster/utils.rs @@ -0,0 +1,269 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ * + */ + +use crate::handlers::http::{logstream::error::StreamError, modal::IngestorMetadata}; +use actix_web::http::header; +use chrono::{DateTime, Utc}; +use http::StatusCode; +use itertools::Itertools; +use reqwest::Response; +use serde::{Deserialize, Serialize}; +use url::Url; + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct QueriedStats { + pub stream: String, + pub time: DateTime, + pub ingestion: IngestionStats, + pub storage: StorageStats, +} + +impl QueriedStats { + pub fn new( + stream: &str, + time: DateTime, + ingestion: IngestionStats, + storage: StorageStats, + ) -> Self { + Self { + stream: stream.to_string(), + time, + ingestion, + storage, + } + } +} + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct ClusterInfo { + domain_name: String, + reachable: bool, + staging_path: String, + storage_path: String, + error: Option, // error message if the ingestor is not reachable + status: Option, // status message if the ingestor is reachable +} + +impl ClusterInfo { + pub fn new( + domain_name: &str, + reachable: bool, + staging_path: String, + storage_path: String, + error: Option, + status: Option, + ) -> Self { + Self { + domain_name: domain_name.to_string(), + reachable, + staging_path, + storage_path, + error, + status, + } + } +} + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct IngestionStats { + pub count: u64, + pub size: String, + pub format: String, +} + +impl IngestionStats { + pub fn new(count: u64, size: String, format: &str) -> Self { + Self { + count, + size, + format: format.to_string(), + } + } +} + +#[derive(Debug, Default, Serialize, Deserialize)] +pub struct StorageStats { + pub size: String, + pub format: String, +} + +impl StorageStats { + pub fn new(size: String, format: &str) -> Self { + Self { + size, + format: format.to_string(), + } + } +} + +pub fn merge_quried_stats(stats: Vec) -> QueriedStats { + // get the actual creation time + // let min_creation_time = stats + // .iter() + // .map(|x| x.creation_time.parse::>().unwrap()) + // .min() + // .unwrap(); // should never be None + + // get the stream name + let stream_name = stats[1].stream.clone(); + + // get the first event at + // let min_first_event_at = stats + // .iter() + // .map(|x| match x.first_event_at.as_ref() { + // we can directly unwrap here because + // we are sure that the first_event_at is a valid date + // Some(fea) => fea.parse::>().unwrap(), + // None => Utc::now(), // current time ie the max time + // }) + // .min() + // .unwrap(); // should never be None + + let min_time = stats.iter().map(|x| x.time).min().unwrap_or_else(Utc::now); + + let cumulative_ingestion = + stats + .iter() + .map(|x| &x.ingestion) + .fold(IngestionStats::default(), |acc, x| IngestionStats { + count: acc.count + x.count, + size: format!( + "{} Bytes", + acc.size.split(' ').collect_vec()[0] + .parse::() + .unwrap_or_default() + + x.size.split(' ').collect_vec()[0] + .parse::() + .unwrap_or_default() + ), + format: x.format.clone(), + }); + + let cumulative_storage = + stats + .iter() + .map(|x| &x.storage) + .fold(StorageStats::default(), |acc, x| StorageStats { + size: format!( + "{} Bytes", + acc.size.split(' ').collect_vec()[0] + .parse::() + .unwrap_or_default() + + x.size.split(' ').collect_vec()[0] + .parse::() + .unwrap_or_default() + ), + format: x.format.clone(), + }); + + QueriedStats::new( + &stream_name, + min_time, + cumulative_ingestion, + cumulative_storage, + ) +} + +pub async fn check_liveness(domain_name: &str) -> bool { + let uri = match 
Url::parse(&format!("{}liveness", domain_name)) { + Ok(uri) => uri, + Err(err) => { + log::error!("Node Indentifier Failed To Parse: {}", err); + return false; + } + }; + + let reqw = reqwest::Client::new() + .get(uri) + .header(header::CONTENT_TYPE, "application/json") + .send() + .await; + + reqw.is_ok() +} + +/// send a request to the ingestor to fetch its stats +/// dead for now +#[allow(dead_code)] +pub async fn send_stats_request( + url: &str, + ingestor: IngestorMetadata, +) -> Result, StreamError> { + if !check_liveness(&ingestor.domain_name).await { + return Ok(None); + } + + let client = reqwest::Client::new(); + let res = client + .get(url) + .header(header::CONTENT_TYPE, "application/json") + .header(header::AUTHORIZATION, ingestor.token) + .send() + .await + .map_err(|err| { + log::error!( + "Fatal: failed to fetch stats from ingestor: {}\n Error: {:?}", + ingestor.domain_name, + err + ); + + StreamError::Network(err) + })?; + + if !res.status().is_success() { + log::error!( + "failed to forward create stream request to ingestor: {}\nResponse Returned: {:?}", + ingestor.domain_name, + res + ); + return Err(StreamError::Custom { + msg: format!( + "failed to forward create stream request to ingestor: {}\nResponse Returned: {:?}", + ingestor.domain_name, + res.text().await.unwrap_or_default() + ), + status: StatusCode::INTERNAL_SERVER_ERROR, + }); + } + + Ok(Some(res)) +} + +/// domain_name needs to be http://ip:port +/// dead code for now +#[allow(dead_code)] +pub fn ingestor_meta_filename(domain_name: &str) -> String { + if domain_name.starts_with("http://") | domain_name.starts_with("https://") { + let url = Url::parse(domain_name).unwrap(); + return format!( + "ingestor.{}.{}.json", + url.host_str().unwrap(), + url.port().unwrap() + ); + } + format!("ingestor.{}.json", domain_name) +} + +pub fn to_url_string(str: String) -> String { + // if the str is already a url i am guessing that it will end in '/' + if str.starts_with("http://") || str.starts_with("https://") { + return str; + } + + format!("http://{}/", str) +} diff --git a/server/src/handlers/http/ingest.rs b/server/src/handlers/http/ingest.rs index 429c3ffcd..673d14c4c 100644 --- a/server/src/handlers/http/ingest.rs +++ b/server/src/handlers/http/ingest.rs @@ -16,27 +16,28 @@ * */ +use super::logstream::error::CreateStreamError; +use super::{kinesis, otel}; +use crate::event::{ + self, + error::EventError, + format::{self, EventFormat}, +}; +use crate::handlers::{ + LOG_SOURCE_KEY, LOG_SOURCE_KINESIS, LOG_SOURCE_OTEL, PREFIX_META, PREFIX_TAGS, SEPARATOR, + STREAM_NAME_HEADER_KEY, +}; +use crate::metadata::{self, STREAM_INFO}; +use crate::option::{Mode, CONFIG}; +use crate::storage::{LogStream, ObjectStorageError}; +use crate::utils::header_parsing::{collect_labelled_headers, ParseHeaderError}; use actix_web::{http::header::ContentType, HttpRequest, HttpResponse}; -use arrow_schema::Field; +use arrow_schema::{Field, Schema}; use bytes::Bytes; use http::StatusCode; use serde_json::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; - -use crate::event::error::EventError; -use crate::event::format::EventFormat; -use crate::event::{self, format}; -use crate::handlers::{ - LOG_SOURCE_KEY, LOG_SOURCE_KINESIS, LOG_SOURCE_OTEL, PREFIX_META, PREFIX_TAGS, SEPARATOR, - STREAM_NAME_HEADER_KEY, -}; -use crate::metadata::STREAM_INFO; -use crate::utils::header_parsing::{collect_labelled_headers, ParseHeaderError}; - -use super::kinesis; -use super::logstream::error::CreateStreamError; - // Handler for POST 
/api/v1/ingest // ingests events by extracting stream name from header // creates if stream does not exist @@ -67,7 +68,7 @@ async fn flatten_and_push_logs( let log_source: String = log_source.to_str().unwrap().to_owned(); match log_source.as_str() { LOG_SOURCE_KINESIS => json = kinesis::flatten_kinesis_logs(&body), - LOG_SOURCE_OTEL => {} + LOG_SOURCE_OTEL => json = otel::flatten_otel_logs(&body), _ => { log::warn!("Unknown log source: {}", log_source); push_logs(stream_name.to_string(), req.clone(), body).await?; @@ -101,7 +102,18 @@ async fn push_logs(stream_name: String, req: HttpRequest, body: Bytes) -> Result .ok_or(PostError::StreamNotFound(stream_name.clone()))? .schema .clone(); - into_event_batch(req, body, schema)? + let time_partition = hash_map + .get(&stream_name) + .ok_or(PostError::StreamNotFound(stream_name.clone()))? + .time_partition + .clone(); + let static_schema_flag = hash_map + .get(&stream_name) + .ok_or(PostError::StreamNotFound(stream_name.clone()))? + .static_schema_flag + .clone(); + + into_event_batch(req, body, schema, time_partition, static_schema_flag)? }; event::Event { @@ -121,6 +133,8 @@ fn into_event_batch( req: HttpRequest, body: Bytes, schema: HashMap>, + time_partition: Option, + static_schema_flag: Option, ) -> Result<(usize, arrow_array::RecordBatch, bool), PostError> { let tags = collect_labelled_headers(&req, PREFIX_TAGS, SEPARATOR)?; let metadata = collect_labelled_headers(&req, PREFIX_META, SEPARATOR)?; @@ -131,7 +145,7 @@ fn into_event_batch( tags, metadata, }; - let (rb, is_first) = event.into_recordbatch(schema)?; + let (rb, is_first) = event.into_recordbatch(schema, time_partition, static_schema_flag)?; Ok((size, rb, is_first)) } @@ -140,7 +154,41 @@ pub async fn create_stream_if_not_exists(stream_name: &str) -> Result<(), PostEr if STREAM_INFO.stream_exists(stream_name) { return Ok(()); } - super::logstream::create_stream(stream_name.to_string()).await?; + match &CONFIG.parseable.mode { + Mode::All | Mode::Query => { + super::logstream::create_stream( + stream_name.to_string(), + "", + "", + Arc::new(Schema::empty()), + ) + .await?; + } + Mode::Ingest => { + // here the ingest server has not found the stream + // so it should check if the stream exists in storage + let store = CONFIG.storage().get_object_store(); + let streams = store.list_streams().await?; + if !streams.contains(&LogStream { + name: stream_name.to_owned(), + }) { + log::error!("Stream {} not found", stream_name); + return Err(PostError::Invalid(anyhow::anyhow!( + "Stream {} not found. 
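`create_stream_if_not_exists` now branches on the server mode: query/standalone servers create the missing stream, while ingest servers only accept streams that already exist in object storage. A condensed sketch of that decision, using local stand-ins rather than the real `Mode` and error types:

```rust
// Local stand-in for the server mode; not the real option::Mode enum.
enum Mode { All, Query, Ingest }

// Query/All servers create the missing stream; Ingest servers only load
// metadata for streams already present in object storage.
fn plan_for_missing_stream(mode: &Mode, exists_in_storage: bool) -> Result<&'static str, String> {
    match mode {
        Mode::All | Mode::Query => Ok("create stream in object storage"),
        Mode::Ingest if exists_in_storage => Ok("upsert stream info from storage"),
        Mode::Ingest => Err("Stream not found. Has it been created?".to_string()),
    }
}

fn main() {
    assert!(plan_for_missing_stream(&Mode::Ingest, false).is_err());
    assert_eq!(
        plan_for_missing_stream(&Mode::Query, false),
        Ok("create stream in object storage")
    );
}
```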
Has it been created?", + stream_name + ))); + } + metadata::STREAM_INFO + .upsert_stream_info( + &*store, + LogStream { + name: stream_name.to_owned(), + }, + ) + .await + .map_err(|_| PostError::StreamNotFound(stream_name.to_owned()))?; + } + } Ok(()) } @@ -158,6 +206,12 @@ pub enum PostError { Invalid(#[from] anyhow::Error), #[error("{0}")] CreateStream(#[from] CreateStreamError), + #[error("Error: {0}")] + CustomError(String), + #[error("Error: {0}")] + NetworkError(#[from] reqwest::Error), + #[error("ObjectStorageError: {0}")] + ObjectStorageError(#[from] ObjectStorageError), } impl actix_web::ResponseError for PostError { @@ -172,6 +226,9 @@ impl actix_web::ResponseError for PostError { } PostError::CreateStream(_) => StatusCode::INTERNAL_SERVER_ERROR, PostError::StreamNotFound(_) => StatusCode::NOT_FOUND, + PostError::CustomError(_) => StatusCode::INTERNAL_SERVER_ERROR, + PostError::NetworkError(_) => StatusCode::INTERNAL_SERVER_ERROR, + PostError::ObjectStorageError(_) => StatusCode::INTERNAL_SERVER_ERROR, } } @@ -243,6 +300,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None, ) .unwrap(); @@ -289,6 +348,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None, ) .unwrap(); @@ -322,8 +383,14 @@ mod tests { let req = TestRequest::default().to_http_request(); - let (_, rb, _) = - into_event_batch(req, Bytes::from(serde_json::to_vec(&json).unwrap()), schema).unwrap(); + let (_, rb, _) = into_event_batch( + req, + Bytes::from(serde_json::to_vec(&json).unwrap()), + schema, + None, + None, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 5); @@ -355,10 +422,14 @@ mod tests { let req = TestRequest::default().to_http_request(); - assert!( - into_event_batch(req, Bytes::from(serde_json::to_vec(&json).unwrap()), schema,) - .is_err() - ); + assert!(into_event_batch( + req, + Bytes::from(serde_json::to_vec(&json).unwrap()), + schema, + None, + None + ) + .is_err()); } #[test] @@ -376,8 +447,14 @@ mod tests { let req = TestRequest::default().to_http_request(); - let (_, rb, _) = - into_event_batch(req, Bytes::from(serde_json::to_vec(&json).unwrap()), schema).unwrap(); + let (_, rb, _) = into_event_batch( + req, + Bytes::from(serde_json::to_vec(&json).unwrap()), + schema, + None, + None, + ) + .unwrap(); assert_eq!(rb.num_rows(), 1); assert_eq!(rb.num_columns(), 3); @@ -393,6 +470,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None ) .is_err()) } @@ -421,6 +500,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None, ) .unwrap(); @@ -474,6 +555,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None, ) .unwrap(); @@ -523,8 +606,14 @@ mod tests { ); let req = TestRequest::default().to_http_request(); - let (_, rb, _) = - into_event_batch(req, Bytes::from(serde_json::to_vec(&json).unwrap()), schema).unwrap(); + let (_, rb, _) = into_event_batch( + req, + Bytes::from(serde_json::to_vec(&json).unwrap()), + schema, + None, + None, + ) + .unwrap(); assert_eq!(rb.num_rows(), 3); assert_eq!(rb.num_columns(), 6); @@ -568,6 +657,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None, ) .unwrap(); @@ -614,10 +705,14 @@ mod tests { .into_iter(), ); - assert!( - into_event_batch(req, Bytes::from(serde_json::to_vec(&json).unwrap()), schema,) - .is_err() - ); + 
assert!(into_event_batch( + req, + Bytes::from(serde_json::to_vec(&json).unwrap()), + schema, + None, + None + ) + .is_err()); } #[test] @@ -649,6 +744,8 @@ mod tests { req, Bytes::from(serde_json::to_vec(&json).unwrap()), HashMap::default(), + None, + None, ) .unwrap(); diff --git a/server/src/handlers/http/logstream.rs b/server/src/handlers/http/logstream.rs index ebcd41189..f7974220c 100644 --- a/server/src/handlers/http/logstream.rs +++ b/server/src/handlers/http/logstream.rs @@ -16,48 +16,120 @@ * */ -use std::fs; - +use self::error::{CreateStreamError, StreamError}; +use super::base_path_without_preceding_slash; +use super::cluster::fetch_stats_from_ingestors; +use super::cluster::utils::{merge_quried_stats, IngestionStats, QueriedStats, StorageStats}; +use crate::alerts::Alerts; +use crate::handlers::{STATIC_SCHEMA_FLAG, TIME_PARTITION_KEY}; +use crate::metadata::STREAM_INFO; +use crate::option::{Mode, CONFIG}; +use crate::static_schema::{convert_static_schema_to_arrow_schema, StaticSchema}; +use crate::storage::{retention::Retention, LogStream, StorageDir, StreamInfo}; +use crate::{ + catalog::{self, remove_manifest_from_snapshot}, + event, stats, +}; +use crate::{metadata, validator}; use actix_web::http::StatusCode; use actix_web::{web, HttpRequest, Responder}; +use arrow_schema::{Field, Schema}; +use bytes::Bytes; use chrono::Utc; +use itertools::Itertools; use serde_json::Value; - -use crate::alerts::Alerts; -use crate::metadata::STREAM_INFO; -use crate::option::CONFIG; -use crate::storage::retention::{self, Retention}; -use crate::storage::{LogStream, StorageDir}; -use crate::{event, stats}; -use crate::{metadata, validator}; - -use self::error::{CreateStreamError, StreamError}; +use std::collections::HashMap; +use std::fs; +use std::sync::Arc; pub async fn delete(req: HttpRequest) -> Result { let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); - if !metadata::STREAM_INFO.stream_exists(&stream_name) { return Err(StreamError::StreamNotFound(stream_name)); } + match CONFIG.parseable.mode { + Mode::Query | Mode::All => { + let objectstore = CONFIG.storage().get_object_store(); + + objectstore.delete_stream(&stream_name).await?; + let stream_dir = StorageDir::new(&stream_name); + if fs::remove_dir_all(&stream_dir.data_path).is_err() { + log::warn!( + "failed to delete local data for stream {}. Clean {} manually", + stream_name, + stream_dir.data_path.to_string_lossy() + ) + } + + let ingestor_metadata = super::cluster::get_ingestor_info().await.map_err(|err| { + log::error!("Fatal: failed to get ingestor info: {:?}", err); + StreamError::from(err) + })?; + + for ingestor in ingestor_metadata { + let url = format!( + "{}{}/logstream/{}", + ingestor.domain_name, + base_path_without_preceding_slash(), + stream_name + ); + + // delete the stream + super::cluster::send_stream_delete_request(&url, ingestor.clone()).await?; + } + } + _ => {} + } - let objectstore = CONFIG.storage().get_object_store(); - objectstore.delete_stream(&stream_name).await?; metadata::STREAM_INFO.delete_stream(&stream_name); event::STREAM_WRITERS.delete_stream(&stream_name); stats::delete_stats(&stream_name, "json").unwrap_or_else(|e| { log::warn!("failed to delete stats for stream {}: {:?}", stream_name, e) }); - let stream_dir = StorageDir::new(&stream_name); - if fs::remove_dir_all(&stream_dir.data_path).is_err() { - log::warn!( - "failed to delete local data for stream {}. 
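The delete handler fans the request out to every ingestor by concatenating its domain name, the base path, and the logstream route. A small sketch of that URL construction, assuming `base_path_without_preceding_slash()` yields `api/v1` and that the ingestor domain name already ends in `/` (as `to_url_string` arranges):

```rust
// Mirrors the format!("{}{}/logstream/{}", ...) used for the per-ingestor fan-out.
fn ingestor_logstream_url(domain_name: &str, base_path: &str, stream: &str) -> String {
    format!("{}{}/logstream/{}", domain_name, base_path, stream)
}

fn main() {
    // Placeholder ingestor address and stream name.
    assert_eq!(
        ingestor_logstream_url("http://10.0.0.2:8000/", "api/v1", "backend"),
        "http://10.0.0.2:8000/api/v1/logstream/backend"
    );
}
```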
Clean {} manually", - stream_name, - stream_dir.data_path.to_string_lossy() - ) + Ok((format!("log stream {stream_name} deleted"), StatusCode::OK)) +} + +pub async fn retention_cleanup( + req: HttpRequest, + body: Bytes, +) -> Result { + let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); + let storage = CONFIG.storage().get_object_store(); + if !metadata::STREAM_INFO.stream_exists(&stream_name) { + // here the ingest server has not found the stream + // so it should check if the stream exists in storage + let check = storage + .list_streams() + .await? + .iter() + .map(|stream| stream.name.clone()) + .contains(&stream_name); + + if !check { + log::error!("Stream {} not found", stream_name.clone()); + return Err(StreamError::StreamNotFound(stream_name.clone())); + } + metadata::STREAM_INFO + .upsert_stream_info( + &*storage, + LogStream { + name: stream_name.clone().to_owned(), + }, + ) + .await + .map_err(|_| StreamError::StreamNotFound(stream_name.clone()))?; + } + let date_list: Vec = serde_json::from_slice(&body).unwrap(); + let res = remove_manifest_from_snapshot(storage.clone(), &stream_name, date_list).await; + let mut first_event_at: Option = None; + if let Err(err) = res { + log::error!("Failed to update manifest list in the snapshot {err:?}") + } else { + first_event_at = res.unwrap(); } - Ok((format!("log stream {stream_name} deleted"), StatusCode::OK)) + Ok((first_event_at, StatusCode::OK)) } pub async fn list(_: HttpRequest) -> impl Responder { @@ -109,21 +181,60 @@ pub async fn get_alert(req: HttpRequest) -> Result Ok((web::Json(alerts), StatusCode::OK)) } -pub async fn put_stream(req: HttpRequest) -> Result { - let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); +pub async fn put_stream(req: HttpRequest, body: Bytes) -> Result { + let time_partition = if let Some((_, time_partition_name)) = req + .headers() + .iter() + .find(|&(key, _)| key == TIME_PARTITION_KEY) + { + time_partition_name.to_str().unwrap() + } else { + "" + }; + let static_schema_flag = if let Some((_, static_schema_flag)) = req + .headers() + .iter() + .find(|&(key, _)| key == STATIC_SCHEMA_FLAG) + { + static_schema_flag.to_str().unwrap() + } else { + "" + }; + let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); + let mut schema = Arc::new(Schema::empty()); if metadata::STREAM_INFO.stream_exists(&stream_name) { // Error if the log stream already exists return Err(StreamError::Custom { msg: format!( - "log stream {stream_name} already exists, please create a new log stream with unique name" + "logstream {stream_name} already exists, please create a new log stream with unique name" ), status: StatusCode::BAD_REQUEST, }); - } else { - create_stream(stream_name).await?; } + if !body.is_empty() && static_schema_flag == "true" { + let static_schema: StaticSchema = serde_json::from_slice(&body)?; + let parsed_schema = convert_static_schema_to_arrow_schema(static_schema); + if let Ok(parsed_schema) = parsed_schema { + schema = parsed_schema; + } else { + return Err(StreamError::Custom { + msg: format!("unable to commit static schema, logstream {stream_name} not created"), + status: StatusCode::BAD_REQUEST, + }); + } + } else if body.is_empty() && static_schema_flag == "true" { + return Err(StreamError::Custom { + msg: format!( + "please provide schema in the request body for static schema logstream {stream_name}" + ), + status: StatusCode::BAD_REQUEST, + }); + } + + create_stream(stream_name, time_partition, 
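`put_stream` reads the time-partition and static-schema headers and falls back to an empty string when a header is absent. The two `if let`/`else` chains can be summarised by a helper like the hypothetical one below (non-UTF-8 values are treated as absent here, whereas the handler unwraps them):

```rust
use actix_web::HttpRequest;

// Missing headers collapse to "", matching the fallback in put_stream above.
fn header_or_empty<'a>(req: &'a HttpRequest, key: &str) -> &'a str {
    req.headers()
        .get(key)
        .and_then(|value| value.to_str().ok())
        .unwrap_or("")
}
```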
static_schema_flag, schema).await?; + Ok(("log stream created", StatusCode::OK)) } @@ -219,8 +330,6 @@ pub async fn put_retention( .put_retention(&stream_name, &retention) .await?; - retention::init_scheduler(&stream_name, retention); - Ok(( format!("set retention configuration for log stream {stream_name}"), StatusCode::OK, @@ -237,14 +346,66 @@ pub async fn put_enable_cache( req: HttpRequest, body: web::Json, ) -> Result { - let enable_cache = body.into_inner(); let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); let storage = CONFIG.storage().get_object_store(); - if CONFIG.parseable.local_cache_path.is_none() { - return Err(StreamError::CacheNotEnabled(stream_name)); + match CONFIG.parseable.mode { + Mode::Query => { + if !metadata::STREAM_INFO.stream_exists(&stream_name) { + return Err(StreamError::StreamNotFound(stream_name)); + } + let ingestor_metadata = super::cluster::get_ingestor_info().await.map_err(|err| { + log::error!("Fatal: failed to get ingestor info: {:?}", err); + StreamError::from(err) + })?; + for ingestor in ingestor_metadata { + let url = format!( + "{}{}/logstream/{}/cache", + ingestor.domain_name, + base_path_without_preceding_slash(), + stream_name + ); + + super::cluster::sync_cache_with_ingestors(&url, ingestor.clone(), *body).await?; + } + } + Mode::Ingest => { + if CONFIG.parseable.local_cache_path.is_none() { + return Err(StreamError::CacheNotEnabled(stream_name)); + } + // here the ingest server has not found the stream + // so it should check if the stream exists in storage + let check = storage + .list_streams() + .await? + .iter() + .map(|stream| stream.name.clone()) + .contains(&stream_name); + + if !check { + log::error!("Stream {} not found", stream_name.clone()); + return Err(StreamError::StreamNotFound(stream_name.clone())); + } + metadata::STREAM_INFO + .upsert_stream_info( + &*storage, + LogStream { + name: stream_name.clone().to_owned(), + }, + ) + .await + .map_err(|_| StreamError::StreamNotFound(stream_name.clone()))?; + } + Mode::All => { + if !metadata::STREAM_INFO.stream_exists(&stream_name) { + return Err(StreamError::StreamNotFound(stream_name)); + } + if CONFIG.parseable.local_cache_path.is_none() { + return Err(StreamError::CacheNotEnabled(stream_name)); + } + } } - + let enable_cache = body.into_inner(); let mut stream_metadata = storage.get_stream_metadata(&stream_name).await?; stream_metadata.cache_enabled = enable_cache; storage @@ -268,25 +429,67 @@ pub async fn get_stats(req: HttpRequest) -> Result let stats = stats::get_current_stats(&stream_name, "json") .ok_or(StreamError::StreamNotFound(stream_name.clone()))?; + let ingestor_stats = if CONFIG.parseable.mode == Mode::Query { + Some(fetch_stats_from_ingestors(&stream_name).await?) 
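The cache toggle now has mode-specific preconditions: query servers forward the change to every ingestor, ingest servers require a local cache path (and look unknown streams up in storage), and standalone servers need both the stream and the cache path. A condensed sketch of those rules with local stand-in types:

```rust
// Local stand-in for the server mode; not the real option::Mode enum.
enum Mode { All, Query, Ingest }

fn cache_toggle_allowed(mode: &Mode, stream_known: bool, cache_path_set: bool) -> Result<(), &'static str> {
    match mode {
        // Query servers forward the toggle to ingestors, so only the stream must exist.
        Mode::Query if !stream_known => Err("stream not found"),
        Mode::Query => Ok(()),
        // Ingest servers need a local cache path; unknown streams are re-checked in storage.
        Mode::Ingest if !cache_path_set => Err("cache not enabled on this ingestor"),
        Mode::Ingest => Ok(()),
        // Standalone servers need both.
        Mode::All if !stream_known => Err("stream not found"),
        Mode::All if !cache_path_set => Err("cache not enabled"),
        Mode::All => Ok(()),
    }
}

fn main() {
    assert!(cache_toggle_allowed(&Mode::Ingest, true, false).is_err());
    assert!(cache_toggle_allowed(&Mode::All, true, true).is_ok());
}
```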
+ } else { + None + }; + + let hash_map = STREAM_INFO.read().expect("Readable"); + let stream_meta = &hash_map + .get(&stream_name) + .ok_or(StreamError::StreamNotFound(stream_name.clone()))?; + let time = Utc::now(); - let stats = serde_json::json!({ - "stream": stream_name, - "time": time, - "ingestion": { - "count": stats.events, - "size": format!("{} {}", stats.ingestion, "Bytes"), - "format": "json" - }, - "storage": { - "size": format!("{} {}", stats.storage, "Bytes"), - "format": "parquet" + let stats = match &stream_meta.first_event_at { + Some(_) => { + let ingestion_stats = IngestionStats::new( + stats.events, + format!("{} {}", stats.ingestion, "Bytes"), + "json", + ); + let storage_stats = + StorageStats::new(format!("{} {}", stats.storage, "Bytes"), "parquet"); + + QueriedStats::new(&stream_name, time, ingestion_stats, storage_stats) } - }); + + None => { + let ingestion_stats = IngestionStats::new( + stats.events, + format!("{} {}", stats.ingestion, "Bytes"), + "json", + ); + let storage_stats = + StorageStats::new(format!("{} {}", stats.storage, "Bytes"), "parquet"); + + QueriedStats::new(&stream_name, time, ingestion_stats, storage_stats) + } + }; + let stats = if let Some(mut ingestor_stats) = ingestor_stats { + ingestor_stats.push(stats); + merge_quried_stats(ingestor_stats) + } else { + stats + }; + + let stats = serde_json::to_value(stats)?; Ok((web::Json(stats), StatusCode::OK)) } +// Check if the first_event_at is empty +pub fn first_event_at_empty(stream_name: &str) -> bool { + let hash_map = STREAM_INFO.read().unwrap(); + if let Some(stream_info) = hash_map.get(stream_name) { + if let Some(first_event_at) = &stream_info.first_event_at { + return first_event_at.is_empty(); + } + } + true +} + fn remove_id_from_alerts(value: &mut Value) { if let Some(Value::Array(alerts)) = value.get_mut("alerts") { alerts @@ -298,20 +501,107 @@ fn remove_id_from_alerts(value: &mut Value) { } } -pub async fn create_stream(stream_name: String) -> Result<(), CreateStreamError> { +pub async fn create_stream( + stream_name: String, + time_partition: &str, + static_schema_flag: &str, + schema: Arc, +) -> Result<(), CreateStreamError> { // fail to proceed if invalid stream name validator::stream_name(&stream_name)?; // Proceed to create log stream if it doesn't exist let storage = CONFIG.storage().get_object_store(); - if let Err(err) = storage.create_stream(&stream_name).await { + if let Err(err) = storage + .create_stream( + &stream_name, + time_partition, + static_schema_flag, + schema.clone(), + ) + .await + { return Err(CreateStreamError::Storage { stream_name, err }); } - metadata::STREAM_INFO.add_stream(stream_name.to_string()); + + let stream_meta = CONFIG + .storage() + .get_object_store() + .get_stream_metadata(&stream_name) + .await; + let stream_meta = stream_meta.unwrap(); + let created_at = stream_meta.created_at; + let mut static_schema: HashMap> = HashMap::new(); + + for (field_name, field) in schema + .fields() + .iter() + .map(|field| (field.name().to_string(), field.clone())) + { + static_schema.insert(field_name, field); + } + + metadata::STREAM_INFO.add_stream( + stream_name.to_string(), + created_at, + time_partition.to_string(), + static_schema_flag.to_string(), + static_schema, + ); Ok(()) } +pub async fn get_stream_info(req: HttpRequest) -> Result { + let stream_name: String = req.match_info().get("logstream").unwrap().parse().unwrap(); + if !metadata::STREAM_INFO.stream_exists(&stream_name) { + return Err(StreamError::StreamNotFound(stream_name)); + } + + if 
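After merging the querier's own numbers with the ingestor responses, `get_stats` returns the serialised `QueriedStats`; the response is roughly the shape below, with illustrative values:

```rust
use serde_json::json;

// Field names follow QueriedStats / IngestionStats / StorageStats above;
// the numbers and timestamps are made up.
fn example_stats_response() -> serde_json::Value {
    json!({
        "stream": "backend",
        "time": "2024-01-01T00:00:00Z",
        "ingestion": { "count": 42, "size": "1024 Bytes", "format": "json" },
        "storage": { "size": "512 Bytes", "format": "parquet" }
    })
}

fn main() {
    println!("{}", example_stats_response());
}
```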
first_event_at_empty(&stream_name) { + let store = CONFIG.storage().get_object_store(); + let dates: Vec = Vec::new(); + if let Ok(Some(first_event_at)) = catalog::get_first_event(store, &stream_name, dates).await + { + if let Err(err) = + metadata::STREAM_INFO.set_first_event_at(&stream_name, Some(first_event_at)) + { + log::error!( + "Failed to update first_event_at in streaminfo for stream {:?} {err:?}", + stream_name + ); + } + } + } + + let hash_map = STREAM_INFO.read().unwrap(); + let stream_meta = &hash_map + .get(&stream_name) + .ok_or(StreamError::StreamNotFound(stream_name.clone()))?; + + let stream_info: StreamInfo = StreamInfo { + created_at: stream_meta.created_at.clone(), + first_event_at: stream_meta.first_event_at.clone(), + time_partition: stream_meta.time_partition.clone(), + cache_enabled: stream_meta.cache_enabled, + static_schema_flag: stream_meta.static_schema_flag.clone(), + }; + + // get the other info from + + Ok((web::Json(stream_info), StatusCode::OK)) +} + +#[allow(unused)] +fn classify_json_error(kind: serde_json::error::Category) -> StatusCode { + match kind { + serde_json::error::Category::Io => StatusCode::INTERNAL_SERVER_ERROR, + serde_json::error::Category::Syntax => StatusCode::BAD_REQUEST, + serde_json::error::Category::Data => StatusCode::INTERNAL_SERVER_ERROR, + serde_json::error::Category::Eof => StatusCode::BAD_REQUEST, + } +} + pub mod error { use actix_web::http::header::ContentType; @@ -323,6 +613,9 @@ pub mod error { validator::error::{AlertValidationError, StreamNameValidationError}, }; + #[allow(unused)] + use super::classify_json_error; + #[derive(Debug, thiserror::Error)] pub enum CreateStreamError { #[error("Stream name validation failed due to {0}")] @@ -367,6 +660,12 @@ pub mod error { InvalidRetentionConfig(serde_json::Error), #[error("{msg}")] Custom { msg: String, status: StatusCode }, + #[error("Error: {0}")] + Anyhow(#[from] anyhow::Error), + #[error("Network Error: {0}")] + Network(#[from] reqwest::Error), + #[error("Could not deserialize into JSON object, {0}")] + SerdeError(#[from] serde_json::Error), } impl actix_web::ResponseError for StreamError { @@ -389,6 +688,11 @@ pub mod error { StreamError::InvalidAlert(_) => StatusCode::BAD_REQUEST, StreamError::InvalidAlertMessage(_, _) => StatusCode::BAD_REQUEST, StreamError::InvalidRetentionConfig(_) => StatusCode::BAD_REQUEST, + StreamError::SerdeError(_) => StatusCode::BAD_REQUEST, + StreamError::Anyhow(_) => StatusCode::INTERNAL_SERVER_ERROR, + StreamError::Network(err) => { + err.status().unwrap_or(StatusCode::INTERNAL_SERVER_ERROR) + } } } @@ -403,6 +707,10 @@ pub mod error { fn from(value: MetadataError) -> Self { match value { MetadataError::StreamMetaNotFound(s) => StreamError::StreamNotFound(s), + MetadataError::StandaloneWithDistributed(s) => StreamError::Custom { + msg: s, + status: StatusCode::INTERNAL_SERVER_ERROR, + }, } } } diff --git a/server/src/handlers/http/middleware.rs b/server/src/handlers/http/middleware.rs index 8c078ae65..427b2bcf6 100644 --- a/server/src/handlers/http/middleware.rs +++ b/server/src/handlers/http/middleware.rs @@ -27,9 +27,12 @@ use actix_web::{ }; use futures_util::future::LocalBoxFuture; -use crate::handlers::{ - AUTHORIZATION_KEY, KINESIS_COMMON_ATTRIBUTES_KEY, LOG_SOURCE_KEY, LOG_SOURCE_KINESIS, - STREAM_NAME_HEADER_KEY, +use crate::{ + handlers::{ + AUTHORIZATION_KEY, KINESIS_COMMON_ATTRIBUTES_KEY, LOG_SOURCE_KEY, LOG_SOURCE_KINESIS, + STREAM_NAME_HEADER_KEY, + }, + option::Mode, }; use crate::{ option::CONFIG, @@ -252,3 +255,106 @@ where 
}) } } + +/// ModeFilterMiddleware factory +pub struct ModeFilter; + +/// PathFilterMiddleware needs to implement Service trait +impl Transform for ModeFilter +where + S: Service, Error = Error>, + S::Future: 'static, + B: 'static, +{ + type Response = ServiceResponse; + type Error = Error; + type InitError = (); + type Transform = ModeFilterMiddleware; + type Future = Ready>; + + fn new_transform(&self, service: S) -> Self::Future { + ready(Ok(ModeFilterMiddleware { service })) + } +} + +/// Actual middleware service +pub struct ModeFilterMiddleware { + service: S, +} + +/// Impl the service trait for the middleware service +impl Service for ModeFilterMiddleware +where + S: Service, Error = Error>, + S::Future: 'static, + B: 'static, +{ + type Response = ServiceResponse; + type Error = Error; + type Future = LocalBoxFuture<'static, Result>; + + // impl poll_ready + actix_web::dev::forward_ready!(service); + + fn call(&self, req: ServiceRequest) -> Self::Future { + let path = req.path(); + let mode = &CONFIG.parseable.mode; + // change error messages based on mode + match mode { + Mode::Query => { + // In Query mode, only allows /ingest endpoint, and /logstream endpoint with GET method + let base_cond = path.split('/').any(|x| x == "ingest"); + let logstream_cond = + !(path.split('/').any(|x| x == "logstream") && req.method() == "GET"); + if base_cond { + Box::pin(async { + Err(actix_web::error::ErrorUnauthorized( + "Ingestion API cannot be accessed in Query Mode", + )) + }) + } else if logstream_cond { + Box::pin(async { + Err(actix_web::error::ErrorUnauthorized( + "Logstream cannot be changed in Query Mode", + )) + }) + } else { + let fut = self.service.call(req); + + Box::pin(async move { + let res = fut.await?; + Ok(res) + }) + } + } + + Mode::Ingest => { + let accessable_endpoints = ["ingest", "logstream", "liveness", "readiness"]; + let cond = path.split('/').any(|x| accessable_endpoints.contains(&x)); + if !cond { + Box::pin(async { + Err(actix_web::error::ErrorUnauthorized( + "Only Ingestion API can be accessed in Ingest Mode", + )) + }) + } else { + let fut = self.service.call(req); + + Box::pin(async move { + let res = fut.await?; + Ok(res) + }) + } + } + + Mode::All => { + let fut = self.service.call(req); + + Box::pin(async move { + let res = fut.await?; + Ok(res) + }) + } + } + } +} diff --git a/server/src/handlers/http/modal/ingest_server.rs b/server/src/handlers/http/modal/ingest_server.rs new file mode 100644 index 000000000..5e94d6cb2 --- /dev/null +++ b/server/src/handlers/http/modal/ingest_server.rs @@ -0,0 +1,365 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
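`ModeFilterMiddleware` gates requests purely on path segments and the configured mode. The ingest-mode rule reduces to a membership check like the sketch below; the paths are examples:

```rust
// Ingest mode only exposes these endpoint segments; anything else is
// rejected by the middleware with 401 Unauthorized.
fn allowed_in_ingest_mode(path: &str) -> bool {
    const OPEN: [&str; 4] = ["ingest", "logstream", "liveness", "readiness"];
    path.split('/').any(|segment| OPEN.contains(&segment))
}

fn main() {
    assert!(allowed_in_ingest_mode("/api/v1/logstream/backend/stats"));
    assert!(!allowed_in_ingest_mode("/api/v1/query"));
}
```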
+ * + */ + +use crate::analytics; +use crate::banner; +use crate::handlers::http::logstream; +use crate::handlers::http::middleware::RouteExt; +use crate::localcache::LocalCacheManager; +use crate::metadata; +use crate::metrics; +use crate::rbac; +use crate::rbac::role::Action; +use crate::storage; +use crate::storage::object_storage::ingestor_metadata_path; +use crate::storage::object_storage::parseable_json_path; +use crate::storage::staging; +use crate::storage::ObjectStorageError; +use crate::sync; + +use super::server::Server; +use super::ssl_acceptor::get_ssl_acceptor; +use super::IngestorMetadata; +use super::OpenIdClient; +use super::ParseableServer; + +use actix_web::body::MessageBody; +use actix_web::Scope; +use actix_web::{web, App, HttpServer}; +use actix_web_prometheus::PrometheusMetrics; +use anyhow::anyhow; +use async_trait::async_trait; +use base64::Engine; +use itertools::Itertools; +use once_cell::sync::Lazy; +use relative_path::RelativePathBuf; + +use crate::{ + handlers::http::{base_path, cross_origin_config}, + option::CONFIG, +}; + +/// ! have to use a guard before using it +pub static INGESTOR_META: Lazy = + Lazy::new(|| staging::get_ingestor_info().expect("dir is readable and writeable")); + +#[derive(Default)] +pub struct IngestServer; + +#[async_trait(?Send)] +impl ParseableServer for IngestServer { + // we dont need oidc client here its just here to satisfy the trait + async fn start( + &self, + prometheus: PrometheusMetrics, + _oidc_client: Option, + ) -> anyhow::Result<()> { + // set the ingestor metadata + self.set_ingestor_metadata().await?; + + // get the ssl stuff + let ssl = get_ssl_acceptor( + &CONFIG.parseable.tls_cert_path, + &CONFIG.parseable.tls_key_path, + )?; + + // fn that creates the app + let create_app_fn = move || { + App::new() + .wrap(prometheus.clone()) + .configure(|config| IngestServer::configure_routes(config, None)) + .wrap(actix_web::middleware::Logger::default()) + .wrap(actix_web::middleware::Compress::default()) + .wrap(cross_origin_config()) + }; + + // concurrent workers equal to number of logical cores + let http_server = HttpServer::new(create_app_fn).workers(num_cpus::get()); + + if let Some(config) = ssl { + http_server + .bind_rustls(&CONFIG.parseable.address, config)? + .run() + .await?; + } else { + http_server.bind(&CONFIG.parseable.address)?.run().await?; + } + + Ok(()) + } + + /// implement the init method will just invoke the initialize method + async fn init(&self) -> anyhow::Result<()> { + self.validate()?; + + // check for querier state. Is it there, or was it there in the past + self.check_querier_state().await?; + // to get the .parseable.json file in staging + self.validate_credentials().await?; + + let metadata = storage::resolve_parseable_metadata().await?; + banner::print(&CONFIG, &metadata).await; + rbac::map::init(&metadata); + // set the info in the global metadata + metadata.set_global(); + + self.initialize().await + } + + fn validate(&self) -> anyhow::Result<()> { + if CONFIG.get_storage_mode_string() == "Local drive" { + return Err(anyhow::Error::msg( + // Error Message can be better + "Ingest Server cannot be started in local storage mode. 
Please start the server in a supported storage mode.", + )); + } + + Ok(()) + } +} + +impl IngestServer { + // configure the api routes + fn configure_routes(config: &mut web::ServiceConfig, _oidc_client: Option) { + config.service( + // Base path "{url}/api/v1" + web::scope(&base_path()) + .service(Server::get_ingest_factory()) + .service(Self::logstream_api()) + .service(Server::get_about_factory()) + .service(Self::analytics_factory()) + .service(Server::get_liveness_factory()) + .service(Server::get_readiness_factory()), + ); + } + + fn analytics_factory() -> Scope { + web::scope("/analytics").service( + // GET "/analytics" ==> Get analytics data + web::resource("").route( + web::get() + .to(analytics::get_analytics) + .authorize(Action::GetAnalytics), + ), + ) + } + + fn logstream_api() -> Scope { + web::scope("/logstream").service( + web::scope("/{logstream}") + .service( + web::resource("").route( + web::delete() + .to(logstream::delete) + .authorize_for_stream(Action::DeleteStream), + ), + ) + .service( + // GET "/logstream/{logstream}/info" ==> Get info for given log stream + web::resource("/info").route( + web::get() + .to(logstream::get_stream_info) + .authorize_for_stream(Action::GetStream), + ), + ) + .service( + // GET "/logstream/{logstream}/stats" ==> Get stats for given log stream + web::resource("/stats").route( + web::get() + .to(logstream::get_stats) + .authorize_for_stream(Action::GetStats), + ), + ) + .service( + web::resource("/cache") + // PUT "/logstream/{logstream}/cache" ==> Set retention for given logstream + .route( + web::put() + .to(logstream::put_enable_cache) + .authorize_for_stream(Action::PutCacheEnabled), + ) + // GET "/logstream/{logstream}/cache" ==> Get retention for given logstream + .route( + web::get() + .to(logstream::get_cache_enabled) + .authorize_for_stream(Action::GetCacheEnabled), + ), + ) + .service( + web::scope("/retention").service( + web::resource("/cleanup").route( + web::post() + .to(logstream::retention_cleanup) + .authorize_for_stream(Action::PutRetention), + ), + ), + ), + ) + } + + // create the ingestor metadata and put the .ingestor.json file in the object store + async fn set_ingestor_metadata(&self) -> anyhow::Result<()> { + let store = CONFIG.storage().get_object_store(); + + // find the meta file in staging if not generate new metadata + let resource = INGESTOR_META.clone(); + // use the id that was generated/found in the staging and + // generate the path for the object store + let path = ingestor_metadata_path(None); + + // we are considering that we can always get from object store + if let Ok(store_meta) = store.get_object(&path).await { + log::info!("Ingestor Metadata is present. Checking for updates"); + let mut store_data = serde_json::from_slice::(&store_meta) + .map_err(|_| anyhow!("IngestorMetadata was not parseable as valid json"))?; + + if store_data.domain_name != INGESTOR_META.domain_name { + log::info!("Ingestor Metadata update needed."); + log::info!( + "Old Domain Name: {}, New Domain Name: {}", + store_data.domain_name, + INGESTOR_META.domain_name + ); + store_data + .domain_name + .clone_from(&INGESTOR_META.domain_name); + store_data.port.clone_from(&INGESTOR_META.port); + + let resource = serde_json::to_string(&store_data)? + .try_into_bytes() + .map_err(|err| anyhow!(err))?; + + // if pushing to object store fails propagate the error + return store + .put_object(&path, resource) + .await + .map_err(|err| anyhow!(err)); + } + } + + let resource = serde_json::to_string(&resource)? 
+ .try_into_bytes() + .map_err(|err| anyhow!(err))?; + + store.put_object(&path, resource).await?; + + Ok(()) + } + + // check for querier state. Is it there, or was it there in the past + // this should happen before the set the ingestor metadata + async fn check_querier_state(&self) -> anyhow::Result<(), ObjectStorageError> { + // how do we check for querier state? + // based on the work flow of the system, the querier will always need to start first + // i.e the querier will create the `.parseable.json` file + + let store = CONFIG.storage().get_object_store(); + let path = parseable_json_path(); + + match store.get_object(&path).await { + Ok(_) => Ok(()), + Err(_) => Err(ObjectStorageError::Custom( + "Query Server has not been started yet. Please start the querier server first." + .to_string(), + )), + } + } + + async fn validate_credentials(&self) -> anyhow::Result<()> { + // check if your creds match with others + let store = CONFIG.storage().get_object_store(); + let base_path = RelativePathBuf::from(""); + let ingestor_metadata = store + .get_objects( + Some(&base_path), + Box::new(|file_name| file_name.starts_with("ingestor")), + ) + .await? + .iter() + // this unwrap will most definateley shoot me in the foot later + .map(|x| serde_json::from_slice::(x).unwrap_or_default()) + .collect_vec(); + + if !ingestor_metadata.is_empty() { + let check = ingestor_metadata[0].token.clone(); + + let token = base64::prelude::BASE64_STANDARD.encode(format!( + "{}:{}", + CONFIG.parseable.username, CONFIG.parseable.password + )); + + let token = format!("Basic {}", token); + + if check != token { + log::error!("Credentials do not match with other ingestors. Please check your credentials and try again."); + return Err(anyhow::anyhow!("Credentials do not match with other ingestors. Please check your credentials and try again.")); + } + } + + Ok(()) + } + + async fn initialize(&self) -> anyhow::Result<()> { + // ! Undefined and Untested behaviour + if let Some(cache_manager) = LocalCacheManager::global() { + cache_manager + .validate(CONFIG.parseable.local_cache_size) + .await?; + }; + + let prometheus = metrics::build_metrics_handler(); + CONFIG.storage().register_store_metrics(&prometheus); + + let storage = CONFIG.storage().get_object_store(); + if let Err(err) = metadata::STREAM_INFO.load(&*storage).await { + log::warn!("could not populate local metadata. {:?}", err); + } + + metrics::fetch_stats_from_storage().await; + + let (localsync_handler, mut localsync_outbox, localsync_inbox) = sync::run_local_sync(); + let (mut remote_sync_handler, mut remote_sync_outbox, mut remote_sync_inbox) = + sync::object_store_sync(); + + let app = self.start(prometheus, CONFIG.parseable.openid.clone()); + tokio::pin!(app); + loop { + tokio::select! { + e = &mut app => { + // actix server finished .. stop other threads and stop the server + remote_sync_inbox.send(()).unwrap_or(()); + localsync_inbox.send(()).unwrap_or(()); + localsync_handler.join().unwrap_or(()); + remote_sync_handler.join().unwrap_or(()); + return e + }, + _ = &mut localsync_outbox => { + // crash the server if localsync fails for any reason + // panic!("Local Sync thread died. Server will fail now!") + return Err(anyhow::Error::msg("Failed to sync local data to drive. 
Please restart the Parseable server.\n\nJoin us on Parseable Slack if the issue persists after restart : https://launchpass.com/parseable")) + }, + _ = &mut remote_sync_outbox => { + // remote_sync failed, this is recoverable by just starting remote_sync thread again + remote_sync_handler.join().unwrap_or(()); + (remote_sync_handler, remote_sync_outbox, remote_sync_inbox) = sync::object_store_sync(); + } + + }; + } + } +} diff --git a/server/src/handlers/http/modal/mod.rs b/server/src/handlers/http/modal/mod.rs new file mode 100644 index 000000000..edd7bd3c3 --- /dev/null +++ b/server/src/handlers/http/modal/mod.rs @@ -0,0 +1,142 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +pub mod ingest_server; +pub mod query_server; +pub mod server; +pub mod ssl_acceptor; + +use std::sync::Arc; + +use actix_web_prometheus::PrometheusMetrics; +use async_trait::async_trait; +use openid::Discovered; + +use crate::oidc; +use base64::Engine; +use serde::Deserialize; +use serde::Serialize; +pub type OpenIdClient = Arc>; + +// to be decided on what the Default version should be +pub const DEFAULT_VERSION: &str = "v3"; + +include!(concat!(env!("OUT_DIR"), "/generated.rs")); + +#[async_trait(?Send)] +pub trait ParseableServer { + // async fn validate(&self) -> Result<(), ObjectStorageError>; + + /// configure the server + async fn start( + &self, + prometheus: PrometheusMetrics, + oidc_client: Option, + ) -> anyhow::Result<()>; + + async fn init(&self) -> anyhow::Result<()>; + + fn validate(&self) -> anyhow::Result<()>; +} + +#[derive(Serialize, Debug, Deserialize, Default, Clone, Eq, PartialEq)] +pub struct IngestorMetadata { + pub version: String, + pub port: String, + pub domain_name: String, + pub bucket_name: String, + pub token: String, + pub ingestor_id: String, +} + +impl IngestorMetadata { + pub fn new( + port: String, + domain_name: String, + version: String, + bucket_name: String, + username: &str, + password: &str, + ingestor_id: String, + ) -> Self { + let token = base64::prelude::BASE64_STANDARD.encode(format!("{}:{}", username, password)); + + let token = format!("Basic {}", token); + + Self { + port, + domain_name, + version, + bucket_name, + token, + ingestor_id, + } + } + + pub fn get_ingestor_id(&self) -> String { + self.ingestor_id.clone() + } +} + +#[cfg(test)] +mod test { + use actix_web::body::MessageBody; + use rstest::rstest; + + use super::{IngestorMetadata, DEFAULT_VERSION}; + + #[rstest] + fn test_deserialize_resource() { + let lhs: IngestorMetadata = IngestorMetadata::new( + "8000".to_string(), + "https://localhost:8000".to_string(), + DEFAULT_VERSION.to_string(), + "somebucket".to_string(), + "admin", + "admin", + "ingestor_id".to_string(), + ); + + let rhs = serde_json::from_slice::(br#"{"version":"v3","port":"8000","domain_name":"https://localhost:8000","bucket_name":"somebucket","token":"Basic YWRtaW46YWRtaW4=", 
"ingestor_id": "ingestor_id"}"#).unwrap(); + + assert_eq!(rhs, lhs); + } + + #[rstest] + fn test_serialize_resource() { + let im = IngestorMetadata::new( + "8000".to_string(), + "https://localhost:8000".to_string(), + DEFAULT_VERSION.to_string(), + "somebucket".to_string(), + "admin", + "admin", + "ingestor_id".to_string(), + ); + + let lhs = serde_json::to_string(&im) + .unwrap() + .try_into_bytes() + .unwrap(); + let rhs = br#"{"version":"v3","port":"8000","domain_name":"https://localhost:8000","bucket_name":"somebucket","token":"Basic YWRtaW46YWRtaW4=","ingestor_id":"ingestor_id"}"# + .try_into_bytes() + .unwrap(); + + assert_eq!(lhs, rhs); + } +} diff --git a/server/src/handlers/http/modal/query_server.rs b/server/src/handlers/http/modal/query_server.rs new file mode 100644 index 000000000..ee258c0d3 --- /dev/null +++ b/server/src/handlers/http/modal/query_server.rs @@ -0,0 +1,191 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +use crate::handlers::http::cluster; +use crate::handlers::http::middleware::RouteExt; +use crate::handlers::http::{base_path, cross_origin_config, API_BASE_PATH, API_VERSION}; + +use crate::rbac::role::Action; +use crate::{analytics, banner, metadata, metrics, migration, rbac, storage}; +use actix_web::web; +use actix_web::web::ServiceConfig; +use actix_web::{App, HttpServer}; +use async_trait::async_trait; +use std::sync::Arc; + +use crate::option::CONFIG; + +use super::server::Server; +use super::ssl_acceptor::get_ssl_acceptor; +use super::{OpenIdClient, ParseableServer}; + +#[derive(Default, Debug)] +pub struct QueryServer; + +#[async_trait(?Send)] +impl ParseableServer for QueryServer { + async fn start( + &self, + prometheus: actix_web_prometheus::PrometheusMetrics, + oidc_client: Option, + ) -> anyhow::Result<()> { + let oidc_client = match oidc_client { + Some(config) => { + let client = config + .connect(&format!("{API_BASE_PATH}/{API_VERSION}/o/code")) + .await?; + Some(Arc::new(client)) + } + + None => None, + }; + + let ssl = get_ssl_acceptor( + &CONFIG.parseable.tls_cert_path, + &CONFIG.parseable.tls_key_path, + )?; + + let create_app_fn = move || { + App::new() + .wrap(prometheus.clone()) + .configure(|config| QueryServer::configure_routes(config, oidc_client.clone())) + .wrap(actix_web::middleware::Logger::default()) + .wrap(actix_web::middleware::Compress::default()) + .wrap(cross_origin_config()) + }; + + // concurrent workers equal to number of cores on the cpu + let http_server = HttpServer::new(create_app_fn).workers(num_cpus::get()); + if let Some(config) = ssl { + http_server + .bind_rustls(&CONFIG.parseable.address, config)? 
+ .run() + .await?; + } else { + http_server.bind(&CONFIG.parseable.address)?.run().await?; + } + + Ok(()) + } + + /// implementation of init should just invoke a call to initialize + async fn init(&self) -> anyhow::Result<()> { + self.validate()?; + migration::run_file_migration(&CONFIG).await?; + CONFIG.validate_storage().await?; + migration::run_metadata_migration(&CONFIG).await?; + let metadata = storage::resolve_parseable_metadata().await?; + banner::print(&CONFIG, &metadata).await; + // initialize the rbac map + rbac::map::init(&metadata); + // keep metadata info in mem + metadata.set_global(); + self.initialize().await + } + + fn validate(&self) -> anyhow::Result<()> { + if CONFIG.get_storage_mode_string() == "Local drive" { + return Err(anyhow::anyhow!( + "Query Server cannot be started in local storage mode. Please start the server in a supported storage mode.", + )); + } + + Ok(()) + } +} + +impl QueryServer { + // configure the api routes + fn configure_routes(config: &mut ServiceConfig, oidc_client: Option) { + config + .service( + web::scope(&base_path()) + // POST "/query" ==> Get results of the SQL query passed in request body + .service(Server::get_query_factory()) + .service(Server::get_liveness_factory()) + .service(Server::get_readiness_factory()) + .service(Server::get_about_factory()) + .service(Server::get_logstream_webscope()) + .service(Server::get_user_webscope()) + .service(Server::get_llm_webscope()) + .service(Server::get_oauth_webscope(oidc_client)) + .service(Server::get_user_role_webscope()) + .service(Self::get_cluster_web_scope()), + ) + .service(Server::get_generated()); + } + + fn get_cluster_web_scope() -> actix_web::Scope { + web::scope("/cluster") + .service( + // GET "/cluster/info" ==> Get info of the cluster + web::resource("/info").route( + web::get() + .to(cluster::get_cluster_info) + .authorize(Action::ListCluster), + ), + ) + // GET "/cluster/metrics" ==> Get metrics of the cluster + .service( + web::resource("/metrics").route( + web::get() + .to(cluster::get_cluster_metrics) + .authorize(Action::ListClusterMetrics), + ), + ) + // DELETE "/cluster/{ingestor_domain:port}" ==> Delete an ingestor from the cluster + .service( + web::scope("/{ingestor}").service( + web::resource("").route( + web::delete() + .to(cluster::remove_ingestor) + .authorize(Action::Deleteingestor), + ), + ), + ) + } + + /// initialize the server, run migrations as needed and start the server + async fn initialize(&self) -> anyhow::Result<()> { + let prometheus = metrics::build_metrics_handler(); + CONFIG.storage().register_store_metrics(&prometheus); + + migration::run_migration(&CONFIG).await?; + + let storage = CONFIG.storage().get_object_store(); + if let Err(e) = metadata::STREAM_INFO.load(&*storage).await { + log::warn!("could not populate local metadata. {:?}", e); + } + + // track all parquet files already in the data directory + storage::retention::load_retention_from_global(); + // load data from stats back to prometheus metrics + metrics::fetch_stats_from_storage().await; + + // all internal data structures populated now. 
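As a usage sketch, the new cluster scope on the query server can be exercised with an authenticated GET; the host, port, and credentials below are placeholders:

```rust
use reqwest::header::AUTHORIZATION;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let client = reqwest::Client::new();

    // GET /api/v1/cluster/info -> per-ingestor reachability, staging and storage paths.
    let info = client
        .get("http://localhost:8000/api/v1/cluster/info")
        .header(AUTHORIZATION, "Basic YWRtaW46YWRtaW4=")
        .send()
        .await?
        .text()
        .await?;
    println!("{info}");
    Ok(())
}
```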
+ // start the analytics scheduler if enabled + if CONFIG.parseable.send_analytics { + analytics::init_analytics_scheduler()?; + } + + self.start(prometheus, CONFIG.parseable.openid.clone()) + .await?; + + Ok(()) + } +} diff --git a/server/src/handlers/http/modal/server.rs b/server/src/handlers/http/modal/server.rs new file mode 100644 index 000000000..d7508f5bc --- /dev/null +++ b/server/src/handlers/http/modal/server.rs @@ -0,0 +1,477 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +use crate::analytics; +use crate::banner; +use crate::handlers; +use crate::handlers::http::about; +use crate::handlers::http::base_path; +use crate::handlers::http::health_check; +use crate::handlers::http::query; +use crate::handlers::http::API_BASE_PATH; +use crate::handlers::http::API_VERSION; +use crate::localcache::LocalCacheManager; +use crate::metadata; +use crate::metrics; +use crate::migration; +use crate::rbac; +use crate::storage; +use crate::sync; +use std::{fs::File, io::BufReader, sync::Arc}; + +use actix_web::web::resource; +use actix_web::Resource; +use actix_web::Scope; +use actix_web::{web, App, HttpServer}; +use actix_web_prometheus::PrometheusMetrics; +use actix_web_static_files::ResourceFiles; +use async_trait::async_trait; + +use rustls::{Certificate, PrivateKey, ServerConfig}; +use rustls_pemfile::{certs, pkcs8_private_keys}; + +use crate::{ + handlers::http::{ + self, cross_origin_config, ingest, llm, logstream, + middleware::{DisAllowRootUser, RouteExt}, + oidc, role, MAX_EVENT_PAYLOAD_SIZE, + }, + option::CONFIG, + rbac::role::Action, +}; + +// use super::generate; +use super::generate; +use super::OpenIdClient; +use super::ParseableServer; + +#[derive(Default)] +pub struct Server; + +#[async_trait(?Send)] +impl ParseableServer for Server { + async fn start( + &self, + prometheus: PrometheusMetrics, + oidc_client: Option, + ) -> anyhow::Result<()> { + let oidc_client = match oidc_client { + Some(config) => { + let client = config + .connect(&format!("{API_BASE_PATH}/{API_VERSION}/o/code")) + .await?; + Some(Arc::new(client)) + } + None => None, + }; + + let create_app_fn = move || { + App::new() + .wrap(prometheus.clone()) + .configure(|cfg| Server::configure_routes(cfg, oidc_client.clone())) + .wrap(actix_web::middleware::Logger::default()) + .wrap(actix_web::middleware::Compress::default()) + .wrap(cross_origin_config()) + }; + + let ssl_acceptor = match ( + &CONFIG.parseable.tls_cert_path, + &CONFIG.parseable.tls_key_path, + ) { + (Some(cert), Some(key)) => { + // init server config builder with safe defaults + let config = ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth(); + + // load TLS key/cert files + let cert_file = &mut BufReader::new(File::open(cert)?); + let key_file = &mut BufReader::new(File::open(key)?); + + // convert files to key/cert objects + let cert_chain = 
certs(cert_file)?.into_iter().map(Certificate).collect(); + + let mut keys: Vec = pkcs8_private_keys(key_file)? + .into_iter() + .map(PrivateKey) + .collect(); + + // exit if no keys could be parsed + if keys.is_empty() { + anyhow::bail!("Could not locate PKCS 8 private keys."); + } + + let server_config = config.with_single_cert(cert_chain, keys.remove(0))?; + + Some(server_config) + } + (_, _) => None, + }; + + // concurrent workers equal to number of cores on the cpu + let http_server = HttpServer::new(create_app_fn).workers(num_cpus::get()); + if let Some(config) = ssl_acceptor { + http_server + .bind_rustls(&CONFIG.parseable.address, config)? + .run() + .await?; + } else { + http_server.bind(&CONFIG.parseable.address)?.run().await?; + } + + Ok(()) + } + + /// implementation of init should just invoke a call to initialize + async fn init(&self) -> anyhow::Result<()> { + self.validate()?; + migration::run_file_migration(&CONFIG).await?; + CONFIG.validate_storage().await?; + migration::run_metadata_migration(&CONFIG).await?; + let metadata = storage::resolve_parseable_metadata().await?; + banner::print(&CONFIG, &metadata).await; + rbac::map::init(&metadata); + metadata.set_global(); + self.initialize().await + } + + fn validate(&self) -> anyhow::Result<()> { + Ok(()) + } +} + +impl Server { + fn configure_routes(config: &mut web::ServiceConfig, oidc_client: Option) { + // there might be a bug in the configure routes method + config + .service( + web::scope(&base_path()) + // POST "/query" ==> Get results of the SQL query passed in request body + .service(Self::get_query_factory()) + .service(Self::get_ingest_factory()) + .service(Self::get_liveness_factory()) + .service(Self::get_readiness_factory()) + .service(Self::get_about_factory()) + .service(Self::get_logstream_webscope()) + .service(Self::get_user_webscope()) + .service(Self::get_llm_webscope()) + .service(Self::get_oauth_webscope(oidc_client)) + .service(Self::get_user_role_webscope()), + ) + .service(Self::get_generated()); + } + + // get the query factory + pub fn get_query_factory() -> Resource { + web::resource("/query").route(web::post().to(query::query).authorize(Action::Query)) + } + + // get the logstream web scope + pub fn get_logstream_webscope() -> Scope { + web::scope("/logstream") + .service( + // GET "/logstream" ==> Get list of all Log Streams on the server + web::resource("") + .route(web::get().to(logstream::list).authorize(Action::ListStream)), + ) + .service( + web::scope("/{logstream}") + .service( + web::resource("") + // PUT "/logstream/{logstream}" ==> Create log stream + .route( + web::put() + .to(logstream::put_stream) + .authorize_for_stream(Action::CreateStream), + ) + // POST "/logstream/{logstream}" ==> Post logs to given log stream + .route( + web::post() + .to(ingest::post_event) + .authorize_for_stream(Action::Ingest), + ) + // DELETE "/logstream/{logstream}" ==> Delete log stream + .route( + web::delete() + .to(logstream::delete) + .authorize_for_stream(Action::DeleteStream), + ) + .app_data(web::PayloadConfig::default().limit(MAX_EVENT_PAYLOAD_SIZE)), + ) + .service( + // GET "/logstream/{logstream}/info" ==> Get info for given log stream + web::resource("/info").route( + web::get() + .to(logstream::get_stream_info) + .authorize_for_stream(Action::GetStream), + ), + ) + .service( + web::resource("/alert") + // PUT "/logstream/{logstream}/alert" ==> Set alert for given log stream + .route( + web::put() + .to(logstream::put_alert) + .authorize_for_stream(Action::PutAlert), + ) + // GET 
"/logstream/{logstream}/alert" ==> Get alert for given log stream + .route( + web::get() + .to(logstream::get_alert) + .authorize_for_stream(Action::GetAlert), + ), + ) + .service( + // GET "/logstream/{logstream}/schema" ==> Get schema for given log stream + web::resource("/schema").route( + web::get() + .to(logstream::schema) + .authorize_for_stream(Action::GetSchema), + ), + ) + .service( + // GET "/logstream/{logstream}/stats" ==> Get stats for given log stream + web::resource("/stats").route( + web::get() + .to(logstream::get_stats) + .authorize_for_stream(Action::GetStats), + ), + ) + .service( + web::resource("/retention") + // PUT "/logstream/{logstream}/retention" ==> Set retention for given logstream + .route( + web::put() + .to(logstream::put_retention) + .authorize_for_stream(Action::PutRetention), + ) + // GET "/logstream/{logstream}/retention" ==> Get retention for given logstream + .route( + web::get() + .to(logstream::get_retention) + .authorize_for_stream(Action::GetRetention), + ), + ) + .service( + web::resource("/cache") + // PUT "/logstream/{logstream}/cache" ==> Set retention for given logstream + .route( + web::put() + .to(logstream::put_enable_cache) + .authorize_for_stream(Action::PutCacheEnabled), + ) + // GET "/logstream/{logstream}/cache" ==> Get retention for given logstream + .route( + web::get() + .to(logstream::get_cache_enabled) + .authorize_for_stream(Action::GetCacheEnabled), + ), + ), + ) + } + + // get the factory for the ingest route + pub fn get_ingest_factory() -> Resource { + web::resource("/ingest") + .route( + web::post() + .to(ingest::ingest) + .authorize_for_stream(Action::Ingest), + ) + .app_data(web::PayloadConfig::default().limit(MAX_EVENT_PAYLOAD_SIZE)) + } + + // get the oauth webscope + pub fn get_oauth_webscope(oidc_client: Option) -> Scope { + let oauth = web::scope("/o") + .service(resource("/login").route(web::get().to(oidc::login))) + .service(resource("/logout").route(web::get().to(oidc::logout))) + .service(resource("/code").route(web::get().to(oidc::reply_login))); + + if let Some(client) = oidc_client { + oauth.app_data(web::Data::from(client)) + } else { + oauth + } + } + + // get the role webscope + pub fn get_user_role_webscope() -> Scope { + web::scope("/role") + // GET Role List + .service(resource("").route(web::get().to(role::list).authorize(Action::ListRole))) + .service( + // PUT and GET Default Role + resource("/default") + .route(web::put().to(role::put_default).authorize(Action::PutRole)) + .route(web::get().to(role::get_default).authorize(Action::GetRole)), + ) + .service( + // PUT, GET, DELETE Roles + resource("/{name}") + .route(web::put().to(role::put).authorize(Action::PutRole)) + .route(web::delete().to(role::delete).authorize(Action::DeleteRole)) + .route(web::get().to(role::get).authorize(Action::GetRole)), + ) + } + + // get the user webscope + pub fn get_user_webscope() -> Scope { + web::scope("/user") + .service( + web::resource("") + // GET /user => List all users + .route( + web::get() + .to(http::rbac::list_users) + .authorize(Action::ListUser), + ), + ) + .service( + web::resource("/{username}") + // PUT /user/{username} => Create a new user + .route( + web::post() + .to(http::rbac::post_user) + .authorize(Action::PutUser), + ) + // DELETE /user/{username} => Delete a user + .route( + web::delete() + .to(http::rbac::delete_user) + .authorize(Action::DeleteUser), + ) + .wrap(DisAllowRootUser), + ) + .service( + web::resource("/{username}/role") + // PUT /user/{username}/roles => Put roles for user + 
.route( + web::put() + .to(http::rbac::put_role) + .authorize(Action::PutUserRoles) + .wrap(DisAllowRootUser), + ) + .route( + web::get() + .to(http::rbac::get_role) + .authorize_for_user(Action::GetUserRoles), + ), + ) + .service( + web::resource("/{username}/generate-new-password") + // POST /user/{username}/generate-new-password => reset password for this user + .route( + web::post() + .to(http::rbac::post_gen_password) + .authorize(Action::PutUser) + .wrap(DisAllowRootUser), + ), + ) + } + + // get the llm webscope + pub fn get_llm_webscope() -> Scope { + web::scope("/llm").service( + web::resource("").route( + web::post() + .to(llm::make_llm_request) + .authorize(Action::QueryLLM), + ), + ) + } + + // get the live check + // GET "/liveness" ==> Liveness check as per https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-a-liveness-command + pub fn get_liveness_factory() -> Resource { + web::resource("/liveness").route(web::get().to(health_check::liveness)) + } + + // get the readiness check + // GET "/readiness" ==> Readiness check as per https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes + pub fn get_readiness_factory() -> Resource { + web::resource("/readiness").route(web::get().to(health_check::readiness)) + } + + // get the about factory + pub fn get_about_factory() -> Resource { + web::resource("/about").route(web::get().to(about::about).authorize(Action::GetAbout)) + } + + // GET "/" ==> Serve the static frontend directory + pub fn get_generated() -> ResourceFiles { + ResourceFiles::new("/", generate()).resolve_not_found_to_root() + } + + async fn initialize(&self) -> anyhow::Result<()> { + if let Some(cache_manager) = LocalCacheManager::global() { + cache_manager + .validate(CONFIG.parseable.local_cache_size) + .await?; + }; + + let prometheus = metrics::build_metrics_handler(); + CONFIG.storage().register_store_metrics(&prometheus); + + migration::run_migration(&CONFIG).await?; + + let storage = CONFIG.storage().get_object_store(); + if let Err(err) = metadata::STREAM_INFO.load(&*storage).await { + log::warn!("could not populate local metadata. {:?}", err); + } + + storage::retention::load_retention_from_global(); + metrics::fetch_stats_from_storage().await; + + let (localsync_handler, mut localsync_outbox, localsync_inbox) = sync::run_local_sync(); + let (mut remote_sync_handler, mut remote_sync_outbox, mut remote_sync_inbox) = + sync::object_store_sync(); + + if CONFIG.parseable.send_analytics { + analytics::init_analytics_scheduler()?; + } + + tokio::spawn(handlers::livetail::server()); + + let app = self.start(prometheus, CONFIG.parseable.openid.clone()); + + tokio::pin!(app); + loop { + tokio::select! { + e = &mut app => { + // actix server finished .. stop other threads and stop the server + remote_sync_inbox.send(()).unwrap_or(()); + localsync_inbox.send(()).unwrap_or(()); + localsync_handler.join().unwrap_or(()); + remote_sync_handler.join().unwrap_or(()); + return e + }, + _ = &mut localsync_outbox => { + // crash the server if localsync fails for any reason + // panic!("Local Sync thread died. Server will fail now!") + return Err(anyhow::Error::msg("Failed to sync local data to drive. 
Please restart the Parseable server.\n\nJoin us on Parseable Slack if the issue persists after restart : https://launchpass.com/parseable")) + }, + _ = &mut remote_sync_outbox => { + // remote_sync failed, this is recoverable by just starting remote_sync thread again + remote_sync_handler.join().unwrap_or(()); + (remote_sync_handler, remote_sync_outbox, remote_sync_inbox) = sync::object_store_sync(); + } + }; + } + } +} diff --git a/server/src/handlers/http/modal/ssl_acceptor.rs b/server/src/handlers/http/modal/ssl_acceptor.rs new file mode 100644 index 000000000..6b51113b1 --- /dev/null +++ b/server/src/handlers/http/modal/ssl_acceptor.rs @@ -0,0 +1,54 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +use std::{fs::File, io::BufReader, path::PathBuf}; + +use itertools::Itertools; +use rustls::{Certificate, PrivateKey, ServerConfig}; +use rustls_pemfile::{certs, pkcs8_private_keys}; + +pub fn get_ssl_acceptor( + tls_cert: &Option, + tls_key: &Option, +) -> anyhow::Result> { + match (tls_cert, tls_key) { + (Some(cert), Some(key)) => { + let server_config = ServerConfig::builder() + .with_safe_defaults() + .with_no_client_auth(); + + let cert_file = &mut BufReader::new(File::open(cert)?); + let key_file = &mut BufReader::new(File::open(key)?); + let cert_chain = certs(cert_file)?.into_iter().map(Certificate).collect_vec(); + + let mut keys = pkcs8_private_keys(key_file)? + .into_iter() + .map(PrivateKey) + .collect_vec(); + + if keys.is_empty() { + anyhow::bail!("Could not locate PKCS 8 private keys."); + } + + Ok(Some( + server_config.with_single_cert(cert_chain, keys.remove(0))?, + )) + } + (_, _) => Ok(None), + } +} diff --git a/server/src/handlers/http/oidc.rs b/server/src/handlers/http/oidc.rs index 92c711c58..bcc749225 100644 --- a/server/src/handlers/http/oidc.rs +++ b/server/src/handlers/http/oidc.rs @@ -139,24 +139,35 @@ pub async fn reply_login( return Ok(HttpResponse::Unauthorized().finish()); }; let username = user_info - .sub + .name .clone() .expect("OIDC provider did not return a sub which is currently required."); let user_info: user::UserInfo = user_info.into(); - - let group: HashSet = claims + let mut group: HashSet = claims .other .remove("groups") .map(serde_json::from_value) .transpose()? 
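// --- Illustrative sketch, not part of this diff: one way the Option<ServerConfig> returned by
// get_ssl_acceptor (ssl_acceptor.rs above) might be consumed when binding the actix-web server.
// The bind address, the empty App factory, and the exact bind_rustls variant (it differs with the
// rustls version a given actix-web release links against) are assumptions, not taken from this PR.
async fn start_http_sketch(
    tls_cert: &Option<std::path::PathBuf>,
    tls_key: &Option<std::path::PathBuf>,
) -> anyhow::Result<()> {
    use actix_web::{App, HttpServer};
    let ssl = get_ssl_acceptor(tls_cert, tls_key)?;
    let server = HttpServer::new(|| App::new() /* .service(...) route wiring elided */);
    match ssl {
        // both cert and key were supplied: serve HTTPS with the rustls config built from the PEM files
        Some(config) => server.bind_rustls("0.0.0.0:8000", config)?.run().await?,
        // otherwise fall back to plain HTTP on the same address
        None => server.bind("0.0.0.0:8000")?.run().await?,
    }
    Ok(())
}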
- .unwrap_or_else(|| { - DEFAULT_ROLE - .lock() - .unwrap() - .clone() - .map(|role| HashSet::from([role])) - .unwrap_or_default() - }); + .unwrap_or_default(); + let metadata = get_metadata().await?; + let mut role_exists = false; + for role in metadata.roles.iter() { + let role_name = role.0; + for group_name in group.iter() { + if group_name.eq(role_name) { + role_exists = true; + break; + } + } + } + if !role_exists || group.is_empty() { + group = DEFAULT_ROLE + .lock() + .unwrap() + .clone() + .map(|role| HashSet::from([role])) + .unwrap_or_default(); + } // User may not exist // create a new one depending on state of metadata diff --git a/server/src/handlers/http/otel.rs b/server/src/handlers/http/otel.rs new file mode 100644 index 000000000..83a6404f8 --- /dev/null +++ b/server/src/handlers/http/otel.rs @@ -0,0 +1,281 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +use bytes::Bytes; +use serde_json::Value; +mod proto; +use crate::handlers::http::otel::proto::logs::v1::LogRecordFlags; +use crate::handlers::http::otel::proto::logs::v1::LogsData; +use crate::handlers::http::otel::proto::logs::v1::SeverityNumber; +use std::collections::BTreeMap; +// Value can be one of types - String, Bool, Int, Double, ArrayValue, AnyValue, KeyValueList, Byte +fn collect_json_from_any_value( + key: &String, + value: super::otel::proto::common::v1::Value, +) -> BTreeMap { + let mut value_json: BTreeMap = BTreeMap::new(); + if value.str_val.is_some() { + value_json.insert( + key.to_string(), + Value::String(value.str_val.as_ref().unwrap().to_owned()), + ); + } + if value.bool_val.is_some() { + value_json.insert(key.to_string(), Value::Bool(value.bool_val.unwrap())); + } + if value.int_val.is_some() { + value_json.insert( + key.to_string(), + Value::Number(serde_json::Number::from(value.int_val.unwrap())), + ); + } + if value.double_val.is_some() { + value_json.insert( + key.to_string(), + Value::Number(serde_json::Number::from_f64(value.double_val.unwrap()).unwrap()), + ); + } + + //ArrayValue is a vector of AnyValue + //traverse by recursively calling the same function + if value.array_val.is_some() { + let array_val = value.array_val.as_ref().unwrap(); + let values = &array_val.values; + + for value in values { + let value = &value.value; + value_json = collect_json_from_any_value(key, value.clone()); + } + } + + //KeyValueList is a vector of KeyValue + //traverse through each element in the vector + if value.kv_list_val.is_some() { + let kv_list_val = value.kv_list_val.unwrap(); + for key_value in kv_list_val.values { + let value = key_value.value; + value_json = collect_json_from_values(&value, key); + } + } + if value.bytes_val.is_some() { + value_json.insert( + key.to_string(), + Value::String(value.bytes_val.as_ref().unwrap().to_owned()), + ); + } + + value_json +} + +//traverse through Value by calling function 
ollect_json_from_any_value +fn collect_json_from_values( + values: &Option, + key: &String, +) -> BTreeMap { + let mut value_json: BTreeMap = BTreeMap::new(); + + for value in values.iter() { + value_json = collect_json_from_any_value(key, value.clone()); + } + + value_json +} + +pub fn flatten_otel_logs(body: &Bytes) -> Vec> { + let mut vec_otel_json: Vec> = Vec::new(); + let body_str = std::str::from_utf8(body).unwrap(); + + let message: LogsData = serde_json::from_str(body_str).unwrap(); + for records in message.resource_logs.iter() { + for record in records.iter() { + let mut otel_json: BTreeMap = BTreeMap::new(); + for resource in record.resource.iter() { + let attributes = &resource.attributes; + for attributes in attributes.iter() { + for attribute in attributes { + let key = &attribute.key; + let value = &attribute.value; + let value_json = + collect_json_from_values(value, &format!("resource_{}", key)); + for key in value_json.keys() { + otel_json.insert(key.to_owned(), value_json[key].to_owned()); + } + } + } + if resource.dropped_attributes_count > 0 { + otel_json.insert( + "resource_dropped_attributes_count".to_string(), + Value::Number(serde_json::Number::from(resource.dropped_attributes_count)), + ); + } + } + + for scope_logs in record.scope_logs.iter() { + for scope_log in scope_logs.iter() { + for instrumentation_scope in scope_log.scope.iter() { + if !instrumentation_scope.name.is_empty() { + otel_json.insert( + "instrumentation_scope_name".to_string(), + Value::String(instrumentation_scope.name.to_string()), + ); + } + if !instrumentation_scope.version.is_empty() { + otel_json.insert( + "instrumentation_scope_version".to_string(), + Value::String(instrumentation_scope.version.to_string()), + ); + } + let attributes = &instrumentation_scope.attributes; + for attributes in attributes.iter() { + for attribute in attributes { + let key = &attribute.key; + let value = &attribute.value; + let value_json = collect_json_from_values( + value, + &format!("instrumentation_scope_{}", key), + ); + for key in value_json.keys() { + otel_json.insert(key.to_owned(), value_json[key].to_owned()); + } + } + } + if instrumentation_scope.dropped_attributes_count > 0 { + otel_json.insert( + "instrumentation_scope_dropped_attributes_count".to_string(), + Value::Number(serde_json::Number::from( + instrumentation_scope.dropped_attributes_count, + )), + ); + } + } + + for log_record in scope_log.log_records.iter() { + let mut log_record_json: BTreeMap = BTreeMap::new(); + if !log_record.time_unix_nano > 0 { + log_record_json.insert( + "time_unix_nano".to_string(), + Value::String(log_record.time_unix_nano.to_string()), + ); + } + if !log_record.observed_time_unix_nano > 0 { + log_record_json.insert( + "observed_time_unix_nano".to_string(), + Value::String(log_record.observed_time_unix_nano.to_string()), + ); + } + if log_record.severity_number > 0 { + let severity_number: i32 = log_record.severity_number; + log_record_json.insert( + "severity_number".to_string(), + Value::Number(serde_json::Number::from(severity_number)), + ); + if log_record.severity_text.is_empty() { + log_record_json.insert( + "severity_text".to_string(), + Value::String( + SeverityNumber::as_str_name(severity_number).to_string(), + ), + ); + } + } + if !log_record.severity_text.is_empty() { + log_record_json.insert( + "severity_text".to_string(), + Value::String(log_record.severity_text.to_string()), + ); + } + + if log_record.body.is_some() { + let body = &log_record.body; + let body_json = collect_json_from_values(body, 
&"body".to_string()); + for key in body_json.keys() { + log_record_json.insert(key.to_owned(), body_json[key].to_owned()); + } + } + + for attributes in log_record.attributes.iter() { + for attribute in attributes { + let key = &attribute.key; + let value = &attribute.value; + let value_json = + collect_json_from_values(value, &format!("log_record_{}", key)); + for key in value_json.keys() { + log_record_json + .insert(key.to_owned(), value_json[key].to_owned()); + } + } + } + + if log_record.dropped_attributes_count > 0 { + log_record_json.insert( + "log_record_dropped_attributes_count".to_string(), + Value::Number(serde_json::Number::from( + log_record.dropped_attributes_count, + )), + ); + } + + if log_record.flags > 0 { + let flags: u32 = log_record.flags; + log_record_json.insert( + "flags_number".to_string(), + Value::Number(serde_json::Number::from(flags)), + ); + log_record_json.insert( + "flags_string".to_string(), + Value::String(LogRecordFlags::as_str_name(flags).to_string()), + ); + } + + if !log_record.span_id.is_empty() { + log_record_json.insert( + "span_id".to_string(), + Value::String(log_record.span_id.to_string()), + ); + } + + if !log_record.trace_id.is_empty() { + log_record_json.insert( + "trace_id".to_string(), + Value::String(log_record.trace_id.to_string()), + ); + } + for key in log_record_json.keys() { + otel_json.insert(key.to_owned(), log_record_json[key].to_owned()); + } + vec_otel_json.push(otel_json.clone()); + } + + if !scope_log.schema_url.is_empty() { + otel_json.insert( + "scope_log_schema_url".to_string(), + Value::String(scope_log.schema_url.to_string()), + ); + } + } + } + if !record.schema_url.is_empty() { + otel_json.insert( + "resource_schema_url".to_string(), + Value::String(record.schema_url.to_string()), + ); + } + } + } + vec_otel_json +} diff --git a/server/src/handlers/http/otel/opentelemetry.proto.common.v1.rs b/server/src/handlers/http/otel/opentelemetry.proto.common.v1.rs new file mode 100644 index 000000000..ca2ea99bc --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry.proto.common.v1.rs @@ -0,0 +1,95 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + + // This file was generated by protoc-gen-rust-protobuf. The file was edited after the generation. + // All the repeated fields were changed to Option> and the `oneof` fields were changed to Option. + + use serde::{Deserialize, Serialize}; + #[derive(Serialize, Deserialize, Debug, Clone)] + /// AnyValue is used to represent any type of attribute value. AnyValue may contain a + /// primitive value such as a string or integer or it may contain an arbitrary nested + /// object containing arrays, key-value lists and primitives. + pub struct AnyValue { + /// The value is one of the listed fields. It is valid for all values to be unspecified + /// in which case this AnyValue is considered to be "empty". 
+ pub value: Value, + } + + #[derive(Serialize, Deserialize, Debug, Clone)] + pub struct Value { + #[serde(rename = "stringValue")] + pub str_val: Option, + #[serde(rename = "boolValue")] + pub bool_val: Option, + #[serde(rename = "intValue")] + pub int_val: Option, + #[serde(rename = "doubleValue")] + pub double_val: Option, + #[serde(rename = "arrayValue")] + pub array_val: Option, + #[serde(rename = "keyVauleList")] + pub kv_list_val: Option, + #[serde(rename = "bytesValue")] + pub bytes_val: Option, + } + + #[derive(Serialize, Deserialize, Debug, Clone)] + /// ArrayValue is a list of AnyValue messages. We need ArrayValue as a message + /// since oneof in AnyValue does not allow repeated fields. + pub struct ArrayValue { + /// Array of values. The array may be empty (contain 0 elements). + pub values: Vec, + } + + #[derive(Serialize, Deserialize, Debug, Clone)] + /// KeyValueList is a list of KeyValue messages. We need KeyValueList as a message + /// since `oneof` in AnyValue does not allow repeated fields. Everywhere else where we need + /// a list of KeyValue messages (e.g. in Span) we use `repeated KeyValue` directly to + /// avoid unnecessary extra wrapping (which slows down the protocol). The 2 approaches + /// are semantically equivalent. + pub struct KeyValueList { + /// A collection of key/value pairs of key-value pairs. The list may be empty (may + /// contain 0 elements). + /// The keys MUST be unique (it is not allowed to have more than one + /// value with the same key). + pub values: Vec, + } + + #[derive(Serialize, Deserialize, Debug, Clone)] + /// KeyValue is a key-value pair that is used to store Span attributes, Link + /// attributes, etc. + pub struct KeyValue { + pub key: String, + pub value: Option, + } + + #[derive(Serialize, Deserialize, Debug)] + /// InstrumentationScope is a message representing the instrumentation scope information + /// such as the fully qualified name and version. + pub struct InstrumentationScope { + /// An empty instrumentation scope name means the name is unknown. + pub name: String, + pub version: String, + /// Additional attributes that describe the scope. \[Optional\]. + /// Attribute keys MUST be unique (it is not allowed to have more than one + /// attribute with the same key). + pub attributes: Option>, + #[serde(rename = "droppedAttributesCount")] + pub dropped_attributes_count: u32, + } + \ No newline at end of file diff --git a/server/src/handlers/http/otel/opentelemetry.proto.logs.v1.rs b/server/src/handlers/http/otel/opentelemetry.proto.logs.v1.rs new file mode 100644 index 000000000..318f85fbf --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry.proto.logs.v1.rs @@ -0,0 +1,291 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +// This file was generated by protoc-gen-rust-protobuf. The file was edited after the generation. 
+ // All the repeated fields were changed to Option>. + + use crate::handlers::http::otel::proto::common::v1::InstrumentationScope; + use crate::handlers::http::otel::proto::common::v1::KeyValue; + use crate::handlers::http::otel::proto::common::v1::Value; + use crate::handlers::http::otel::proto::resource::v1::Resource; + use serde::{Deserialize, Serialize}; + + #[derive(Serialize, Deserialize, Debug)] + /// LogsData represents the logs data that can be stored in a persistent storage, + /// OR can be embedded by other protocols that transfer OTLP logs data but do not + /// implement the OTLP protocol. + /// + /// The main difference between this message and collector protocol is that + /// in this message there will not be any "control" or "metadata" specific to + /// OTLP protocol. + /// + /// When new fields are added into this message, the OTLP request MUST be updated + /// as well. + pub struct LogsData { + /// An array of ResourceLogs. + /// For data coming from a single resource this array will typically contain + /// one element. Intermediary nodes that receive data from multiple origins + /// typically batch the data before forwarding further and in that case this + /// array will contain multiple elements. + #[serde(rename = "resourceLogs")] + pub resource_logs: Option>, + } + + #[derive(Serialize, Deserialize, Debug)] + /// A collection of ScopeLogs from a Resource. + pub struct ResourceLogs { + /// The resource for the logs in this message. + /// If this field is not set then resource info is unknown. + pub resource: Option, + /// A list of ScopeLogs that originate from a resource. + #[serde(rename = "scopeLogs")] + pub scope_logs: Option>, + /// This schema_url applies to the data in the "resource" field. It does not apply + /// to the data in the "scope_logs" field which have their own schema_url field. + #[serde(rename = "schemaUrl")] + pub schema_url: String, + } + + #[derive(Serialize, Deserialize, Debug)] + /// A collection of Logs produced by a Scope. + pub struct ScopeLogs { + /// The instrumentation scope information for the logs in this message. + /// Semantically when InstrumentationScope isn't set, it is equivalent with + /// an empty instrumentation scope name (unknown). + pub scope: Option, + /// A list of log records. + #[serde(rename = "logRecords")] + pub log_records: Vec, + /// This schema_url applies to all logs in the "logs" field. + #[serde(rename = "schemaUrl")] + pub schema_url: String, + } + + #[derive(Serialize, Deserialize, Debug)] + /// A log record according to OpenTelemetry Log Data Model: + /// + pub struct LogRecord { + /// time_unix_nano is the time when the event occurred. + /// Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + /// Value of 0 indicates unknown or missing timestamp. + #[serde(rename = "timeUnixNano")] + pub time_unix_nano: u64, + /// Time when the event was observed by the collection system. + /// For events that originate in OpenTelemetry (e.g. using OpenTelemetry Logging SDK) + /// this timestamp is typically set at the generation time and is equal to Timestamp. + /// For events originating externally and collected by OpenTelemetry (e.g. using + /// Collector) this is the time when OpenTelemetry's code observed the event measured + /// by the clock of the OpenTelemetry code. This field MUST be set once the event is + /// observed by OpenTelemetry. 
+ /// + /// For converting OpenTelemetry log data to formats that support only one timestamp or + /// when receiving OpenTelemetry log data by recipients that support only one timestamp + /// internally the following logic is recommended: + /// - Use time_unix_nano if it is present, otherwise use observed_time_unix_nano. + /// + /// Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + /// Value of 0 indicates unknown or missing timestamp. + #[serde(rename = "observedTimeUnixNano")] + pub observed_time_unix_nano: u64, + /// Numerical value of the severity, normalized to values described in Log Data Model. + /// \[Optional\]. + #[serde(rename = "severityNumber")] + pub severity_number: i32, + /// The severity text (also known as log level). The original string representation as + /// it is known at the source. \[Optional\]. + #[serde(rename = "severityText")] + pub severity_text: String, + /// A value containing the body of the log record. Can be for example a human-readable + /// string message (including multi-line) describing the event in a free form or it can + /// be a structured data composed of arrays and maps of other values. \[Optional\]. + pub body: Option, + /// Additional attributes that describe the specific event occurrence. \[Optional\]. + /// Attribute keys MUST be unique (it is not allowed to have more than one + /// attribute with the same key). + pub attributes: Option>, + #[serde(rename = "droppedAttributesCount")] + pub dropped_attributes_count: u32, + /// Flags, a bit field. 8 least significant bits are the trace flags as + /// defined in W3C Trace Context specification. 24 most significant bits are reserved + /// and must be set to 0. Readers must not assume that 24 most significant bits + /// will be zero and must correctly mask the bits when reading 8-bit trace flag (use + /// flags & LOG_RECORD_FLAGS_TRACE_FLAGS_MASK). \[Optional\]. + pub flags: u32, + /// A unique identifier for a trace. All logs from the same trace share + /// the same `trace_id`. The ID is a 16-byte array. An ID with all zeroes OR + /// of length other than 16 bytes is considered invalid (empty string in OTLP/JSON + /// is zero-length and thus is also invalid). + /// + /// This field is optional. + /// + /// The receivers SHOULD assume that the log record is not associated with a + /// trace if any of the following is true: + /// - the field is not present, + /// - the field contains an invalid value. + #[serde(rename = "traceId")] + pub trace_id: String, + /// A unique identifier for a span within a trace, assigned when the span + /// is created. The ID is an 8-byte array. An ID with all zeroes OR of length + /// other than 8 bytes is considered invalid (empty string in OTLP/JSON + /// is zero-length and thus is also invalid). + /// + /// This field is optional. If the sender specifies a valid span_id then it SHOULD also + /// specify a valid trace_id. + /// + /// The receivers SHOULD assume that the log record is not associated with a + /// span if any of the following is true: + /// - the field is not present, + /// - the field contains an invalid value. + #[serde(rename = "spanId")] + pub span_id: String, + } + /// Possible values for LogRecord.SeverityNumber. + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] + #[repr(i32)] + pub enum SeverityNumber { + /// UNSPECIFIED is the default SeverityNumber, it MUST NOT be used. 
+ Unspecified = 0, + Trace = 1, + Trace2 = 2, + Trace3 = 3, + Trace4 = 4, + Debug = 5, + Debug2 = 6, + Debug3 = 7, + Debug4 = 8, + Info = 9, + Info2 = 10, + Info3 = 11, + Info4 = 12, + Warn = 13, + Warn2 = 14, + Warn3 = 15, + Warn4 = 16, + Error = 17, + Error2 = 18, + Error3 = 19, + Error4 = 20, + Fatal = 21, + Fatal2 = 22, + Fatal3 = 23, + Fatal4 = 24, + } + impl SeverityNumber { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(severity_number: i32) -> &'static str { + match severity_number { + 0 => "SEVERITY_NUMBER_UNSPECIFIED", + 1 => "SEVERITY_NUMBER_TRACE", + 2 => "SEVERITY_NUMBER_TRACE2", + 3 => "SEVERITY_NUMBER_TRACE3", + 4 => "SEVERITY_NUMBER_TRACE4", + 5 => "SEVERITY_NUMBER_DEBUG", + 6 => "SEVERITY_NUMBER_DEBUG2", + 7 => "SEVERITY_NUMBER_DEBUG3", + 8 => "SEVERITY_NUMBER_DEBUG4", + 9 => "SEVERITY_NUMBER_INFO", + 10 => "SEVERITY_NUMBER_INFO2", + 11 => "SEVERITY_NUMBER_INFO3", + 12 => "SEVERITY_NUMBER_INFO4", + 13 => "SEVERITY_NUMBER_WARN", + 14 => "SEVERITY_NUMBER_WARN2", + 15 => "SEVERITY_NUMBER_WARN3", + 16 => "SEVERITY_NUMBER_WARN4", + 17 => "SEVERITY_NUMBER_ERROR", + 18 => "SEVERITY_NUMBER_ERROR2", + 19 => "SEVERITY_NUMBER_ERROR3", + 20 => "SEVERITY_NUMBER_ERROR4", + 21 => "SEVERITY_NUMBER_FATAL", + 22 => "SEVERITY_NUMBER_FATAL2", + 23 => "SEVERITY_NUMBER_FATAL3", + 24 => "SEVERITY_NUMBER_FATAL4", + _ => "Invalid severity number", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "SEVERITY_NUMBER_UNSPECIFIED" => Some(Self::Unspecified), + "SEVERITY_NUMBER_TRACE" => Some(Self::Trace), + "SEVERITY_NUMBER_TRACE2" => Some(Self::Trace2), + "SEVERITY_NUMBER_TRACE3" => Some(Self::Trace3), + "SEVERITY_NUMBER_TRACE4" => Some(Self::Trace4), + "SEVERITY_NUMBER_DEBUG" => Some(Self::Debug), + "SEVERITY_NUMBER_DEBUG2" => Some(Self::Debug2), + "SEVERITY_NUMBER_DEBUG3" => Some(Self::Debug3), + "SEVERITY_NUMBER_DEBUG4" => Some(Self::Debug4), + "SEVERITY_NUMBER_INFO" => Some(Self::Info), + "SEVERITY_NUMBER_INFO2" => Some(Self::Info2), + "SEVERITY_NUMBER_INFO3" => Some(Self::Info3), + "SEVERITY_NUMBER_INFO4" => Some(Self::Info4), + "SEVERITY_NUMBER_WARN" => Some(Self::Warn), + "SEVERITY_NUMBER_WARN2" => Some(Self::Warn2), + "SEVERITY_NUMBER_WARN3" => Some(Self::Warn3), + "SEVERITY_NUMBER_WARN4" => Some(Self::Warn4), + "SEVERITY_NUMBER_ERROR" => Some(Self::Error), + "SEVERITY_NUMBER_ERROR2" => Some(Self::Error2), + "SEVERITY_NUMBER_ERROR3" => Some(Self::Error3), + "SEVERITY_NUMBER_ERROR4" => Some(Self::Error4), + "SEVERITY_NUMBER_FATAL" => Some(Self::Fatal), + "SEVERITY_NUMBER_FATAL2" => Some(Self::Fatal2), + "SEVERITY_NUMBER_FATAL3" => Some(Self::Fatal3), + "SEVERITY_NUMBER_FATAL4" => Some(Self::Fatal4), + _ => None, + } + } + } + /// LogRecordFlags is defined as a protobuf 'uint32' type and is to be used as + /// bit-fields. Each non-zero value defined in this enum is a bit-mask. + /// To extract the bit-field, for example, use an expression like: + /// + /// (logRecord.flags & LOG_RECORD_FLAGS_TRACE_FLAGS_MASK) + /// + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, ::prost::Enumeration)] + #[repr(i32)] + pub enum LogRecordFlags { + /// The zero value for the enum. Should not be used for comparisons. 
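// --- Illustrative sketch, not part of this diff: how the severity helper above and the flags
// field are typically interpreted when flattening a record. The values are examples only.
fn severity_and_flags_example() {
    // severity number 9 corresponds to INFO in the OTLP log data model
    assert_eq!(SeverityNumber::as_str_name(9), "SEVERITY_NUMBER_INFO");
    // only the low 8 bits of `flags` carry the W3C trace flags; mask before interpreting them
    let flags: u32 = 0x0000_0101;
    let trace_flags = flags & 0xFF;
    assert_eq!(trace_flags, 0x01);
}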
+ /// Instead use bitwise "and" with the appropriate mask as shown above. + DoNotUse = 0, + /// Bits 0-7 are used for trace flags. + TraceFlagsMask = 255, + } + impl LogRecordFlags { + /// String value of the enum field names used in the ProtoBuf definition. + /// + /// The values are not transformed in any way and thus are considered stable + /// (if the ProtoBuf definition does not change) and safe for programmatic use. + pub fn as_str_name(flag: u32) -> &'static str { + match flag { + 0 => "LOG_RECORD_FLAGS_DO_NOT_USE", + 255 => "LOG_RECORD_FLAGS_TRACE_FLAGS_MASK", + _ => "Invalid flag", + } + } + /// Creates an enum from field names used in the ProtoBuf definition. + pub fn from_str_name(value: &str) -> ::core::option::Option { + match value { + "LOG_RECORD_FLAGS_DO_NOT_USE" => Some(Self::DoNotUse), + "LOG_RECORD_FLAGS_TRACE_FLAGS_MASK" => Some(Self::TraceFlagsMask), + _ => None, + } + } + } + \ No newline at end of file diff --git a/server/src/handlers/http/otel/opentelemetry.proto.resource.v1.rs b/server/src/handlers/http/otel/opentelemetry.proto.resource.v1.rs new file mode 100644 index 000000000..1d72275b0 --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry.proto.resource.v1.rs @@ -0,0 +1,38 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + // This file was generated by protoc-gen-rust-protobuf. The file was edited after the generation. + // All the repeated fields were changed to Option> + + use crate::handlers::http::otel::proto::common::v1::KeyValue; + use serde::{Deserialize, Serialize}; + + #[derive(Serialize, Deserialize, Debug)] + /// Resource information. + pub struct Resource { + /// Set of attributes that describe the resource. + /// Attribute keys MUST be unique (it is not allowed to have more than one + /// attribute with the same key). + #[serde(rename = "attributes")] + pub attributes: Option>, + /// dropped_attributes_count is the number of dropped attributes. If the value is 0, then + /// no attributes were dropped. 
+ + #[serde(rename = "droppedAttributesCount")] + pub dropped_attributes_count: u32, + } + \ No newline at end of file diff --git a/server/src/handlers/http/otel/opentelemetry/proto/README.md b/server/src/handlers/http/otel/opentelemetry/proto/README.md new file mode 100644 index 000000000..d0281330e --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry/proto/README.md @@ -0,0 +1,2 @@ +The following protobuf definitions are vendored from: +https://github.com/open-telemetry/opentelemetry-proto/tree/v1.0.0/opentelemetry/proto diff --git a/server/src/handlers/http/otel/opentelemetry/proto/common/v1/common.proto b/server/src/handlers/http/otel/opentelemetry/proto/common/v1/common.proto new file mode 100644 index 000000000..f7ee8f265 --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry/proto/common/v1/common.proto @@ -0,0 +1,81 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.common.v1; + +option csharp_namespace = "OpenTelemetry.Proto.Common.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.common.v1"; +option java_outer_classname = "CommonProto"; +option go_package = "go.opentelemetry.io/proto/otlp/common/v1"; + +// AnyValue is used to represent any type of attribute value. AnyValue may contain a +// primitive value such as a string or integer or it may contain an arbitrary nested +// object containing arrays, key-value lists and primitives. +message AnyValue { + // The value is one of the listed fields. It is valid for all values to be unspecified + // in which case this AnyValue is considered to be "empty". + oneof value { + string string_value = 1; + bool bool_value = 2; + int64 int_value = 3; + double double_value = 4; + ArrayValue array_value = 5; + KeyValueList kvlist_value = 6; + bytes bytes_value = 7; + } +} + +// ArrayValue is a list of AnyValue messages. We need ArrayValue as a message +// since oneof in AnyValue does not allow repeated fields. +message ArrayValue { + // Array of values. The array may be empty (contain 0 elements). + repeated AnyValue values = 1; +} + +// KeyValueList is a list of KeyValue messages. We need KeyValueList as a message +// since `oneof` in AnyValue does not allow repeated fields. Everywhere else where we need +// a list of KeyValue messages (e.g. in Span) we use `repeated KeyValue` directly to +// avoid unnecessary extra wrapping (which slows down the protocol). The 2 approaches +// are semantically equivalent. +message KeyValueList { + // A collection of key/value pairs of key-value pairs. The list may be empty (may + // contain 0 elements). + // The keys MUST be unique (it is not allowed to have more than one + // value with the same key). + repeated KeyValue values = 1; +} + +// KeyValue is a key-value pair that is used to store Span attributes, Link +// attributes, etc. 
+message KeyValue { + string key = 1; + AnyValue value = 2; +} + +// InstrumentationScope is a message representing the instrumentation scope information +// such as the fully qualified name and version. +message InstrumentationScope { + // An empty instrumentation scope name means the name is unknown. + string name = 1; + string version = 2; + + // Additional attributes that describe the scope. [Optional]. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated KeyValue attributes = 3; + uint32 dropped_attributes_count = 4; +} diff --git a/server/src/handlers/http/otel/opentelemetry/proto/logs/v1/logs.proto b/server/src/handlers/http/otel/opentelemetry/proto/logs/v1/logs.proto new file mode 100644 index 000000000..0b4b64972 --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry/proto/logs/v1/logs.proto @@ -0,0 +1,203 @@ +// Copyright 2020, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.logs.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Logs.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.logs.v1"; +option java_outer_classname = "LogsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/logs/v1"; + +// LogsData represents the logs data that can be stored in a persistent storage, +// OR can be embedded by other protocols that transfer OTLP logs data but do not +// implement the OTLP protocol. +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message LogsData { + // An array of ResourceLogs. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceLogs resource_logs = 1; +} + +// A collection of ScopeLogs from a Resource. +message ResourceLogs { + reserved 1000; + + // The resource for the logs in this message. + // If this field is not set then resource info is unknown. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of ScopeLogs that originate from a resource. + repeated ScopeLogs scope_logs = 2; + + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_logs" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Logs produced by a Scope. +message ScopeLogs { + // The instrumentation scope information for the logs in this message. 
+ // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of log records. + repeated LogRecord log_records = 2; + + // This schema_url applies to all logs in the "logs" field. + string schema_url = 3; +} + +// Possible values for LogRecord.SeverityNumber. +enum SeverityNumber { + // UNSPECIFIED is the default SeverityNumber, it MUST NOT be used. + SEVERITY_NUMBER_UNSPECIFIED = 0; + SEVERITY_NUMBER_TRACE = 1; + SEVERITY_NUMBER_TRACE2 = 2; + SEVERITY_NUMBER_TRACE3 = 3; + SEVERITY_NUMBER_TRACE4 = 4; + SEVERITY_NUMBER_DEBUG = 5; + SEVERITY_NUMBER_DEBUG2 = 6; + SEVERITY_NUMBER_DEBUG3 = 7; + SEVERITY_NUMBER_DEBUG4 = 8; + SEVERITY_NUMBER_INFO = 9; + SEVERITY_NUMBER_INFO2 = 10; + SEVERITY_NUMBER_INFO3 = 11; + SEVERITY_NUMBER_INFO4 = 12; + SEVERITY_NUMBER_WARN = 13; + SEVERITY_NUMBER_WARN2 = 14; + SEVERITY_NUMBER_WARN3 = 15; + SEVERITY_NUMBER_WARN4 = 16; + SEVERITY_NUMBER_ERROR = 17; + SEVERITY_NUMBER_ERROR2 = 18; + SEVERITY_NUMBER_ERROR3 = 19; + SEVERITY_NUMBER_ERROR4 = 20; + SEVERITY_NUMBER_FATAL = 21; + SEVERITY_NUMBER_FATAL2 = 22; + SEVERITY_NUMBER_FATAL3 = 23; + SEVERITY_NUMBER_FATAL4 = 24; +} + +// LogRecordFlags is defined as a protobuf 'uint32' type and is to be used as +// bit-fields. Each non-zero value defined in this enum is a bit-mask. +// To extract the bit-field, for example, use an expression like: +// +// (logRecord.flags & LOG_RECORD_FLAGS_TRACE_FLAGS_MASK) +// +enum LogRecordFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + LOG_RECORD_FLAGS_DO_NOT_USE = 0; + + // Bits 0-7 are used for trace flags. + LOG_RECORD_FLAGS_TRACE_FLAGS_MASK = 0x000000FF; + + // Bits 8-31 are reserved for future use. +} + +// A log record according to OpenTelemetry Log Data Model: +// https://github.com/open-telemetry/oteps/blob/main/text/logs/0097-log-data-model.md +message LogRecord { + reserved 4; + + // time_unix_nano is the time when the event occurred. + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + // Value of 0 indicates unknown or missing timestamp. + fixed64 time_unix_nano = 1; + + // Time when the event was observed by the collection system. + // For events that originate in OpenTelemetry (e.g. using OpenTelemetry Logging SDK) + // this timestamp is typically set at the generation time and is equal to Timestamp. + // For events originating externally and collected by OpenTelemetry (e.g. using + // Collector) this is the time when OpenTelemetry's code observed the event measured + // by the clock of the OpenTelemetry code. This field MUST be set once the event is + // observed by OpenTelemetry. + // + // For converting OpenTelemetry log data to formats that support only one timestamp or + // when receiving OpenTelemetry log data by recipients that support only one timestamp + // internally the following logic is recommended: + // - Use time_unix_nano if it is present, otherwise use observed_time_unix_nano. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. + // Value of 0 indicates unknown or missing timestamp. + fixed64 observed_time_unix_nano = 11; + + // Numerical value of the severity, normalized to values described in Log Data Model. + // [Optional]. + SeverityNumber severity_number = 2; + + // The severity text (also known as log level). 
The original string representation as + // it is known at the source. [Optional]. + string severity_text = 3; + + // A value containing the body of the log record. Can be for example a human-readable + // string message (including multi-line) describing the event in a free form or it can + // be a structured data composed of arrays and maps of other values. [Optional]. + opentelemetry.proto.common.v1.AnyValue body = 5; + + // Additional attributes that describe the specific event occurrence. [Optional]. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 6; + uint32 dropped_attributes_count = 7; + + // Flags, a bit field. 8 least significant bits are the trace flags as + // defined in W3C Trace Context specification. 24 most significant bits are reserved + // and must be set to 0. Readers must not assume that 24 most significant bits + // will be zero and must correctly mask the bits when reading 8-bit trace flag (use + // flags & LOG_RECORD_FLAGS_TRACE_FLAGS_MASK). [Optional]. + fixed32 flags = 8; + + // A unique identifier for a trace. All logs from the same trace share + // the same `trace_id`. The ID is a 16-byte array. An ID with all zeroes OR + // of length other than 16 bytes is considered invalid (empty string in OTLP/JSON + // is zero-length and thus is also invalid). + // + // This field is optional. + // + // The receivers SHOULD assume that the log record is not associated with a + // trace if any of the following is true: + // - the field is not present, + // - the field contains an invalid value. + bytes trace_id = 9; + + // A unique identifier for a span within a trace, assigned when the span + // is created. The ID is an 8-byte array. An ID with all zeroes OR of length + // other than 8 bytes is considered invalid (empty string in OTLP/JSON + // is zero-length and thus is also invalid). + // + // This field is optional. If the sender specifies a valid span_id then it SHOULD also + // specify a valid trace_id. + // + // The receivers SHOULD assume that the log record is not associated with a + // span if any of the following is true: + // - the field is not present, + // - the field contains an invalid value. + bytes span_id = 10; +} diff --git a/server/src/handlers/http/otel/opentelemetry/proto/resource/v1/resource.proto b/server/src/handlers/http/otel/opentelemetry/proto/resource/v1/resource.proto new file mode 100644 index 000000000..6637560bc --- /dev/null +++ b/server/src/handlers/http/otel/opentelemetry/proto/resource/v1/resource.proto @@ -0,0 +1,37 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
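// --- Illustrative sketch, not part of this diff: end-to-end use of the vendored OTLP types.
// An OTLP/JSON logs export is deserialized into LogsData and flattened into one BTreeMap per log
// record by flatten_otel_logs (otel.rs earlier in this diff). The payload below is hand-written
// for illustration, not taken from the repository or the OTLP test data.
fn flatten_example() {
    use bytes::Bytes;
    let payload = r#"{
      "resourceLogs": [{
        "resource": { "attributes": [{ "key": "service.name", "value": { "stringValue": "demo" } }], "droppedAttributesCount": 0 },
        "scopeLogs": [{
          "logRecords": [{
            "timeUnixNano": 1700000000000000000,
            "observedTimeUnixNano": 1700000000000000000,
            "severityNumber": 9,
            "severityText": "",
            "attributes": [{ "key": "http.method", "value": { "stringValue": "GET" } }],
            "droppedAttributesCount": 0,
            "flags": 0,
            "traceId": "5b8efff798038103d269b633813fc60c",
            "spanId": "eee19b7ec3c1b174"
          }],
          "schemaUrl": ""
        }],
        "schemaUrl": ""
      }]
    }"#;
    let rows = flatten_otel_logs(&Bytes::from(payload));
    // each row is a flat BTreeMap<String, serde_json::Value>; resource attributes come out
    // prefixed with "resource_", log record attributes with "log_record_", and so on
    assert_eq!(rows.len(), 1);
}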
+ +syntax = "proto3"; + +package opentelemetry.proto.resource.v1; + +import "opentelemetry/proto/common/v1/common.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Resource.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.resource.v1"; +option java_outer_classname = "ResourceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/resource/v1"; + +// Resource information. +message Resource { + // Set of attributes that describe the resource. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // dropped_attributes_count is the number of dropped attributes. If the value is 0, then + // no attributes were dropped. + uint32 dropped_attributes_count = 2; +} diff --git a/server/src/handlers/http/otel/proto.rs b/server/src/handlers/http/otel/proto.rs new file mode 100644 index 000000000..9322bfcc5 --- /dev/null +++ b/server/src/handlers/http/otel/proto.rs @@ -0,0 +1,38 @@ +/* + * Parseable Server (C) 2022 - 2024 Parseable, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + * + */ + +/// Common types used across all event types. +pub mod common { + pub mod v1 { + include!("opentelemetry.proto.common.v1.rs"); + } +} + +/// Generated types used for logs. +pub mod logs { + pub mod v1 { + include!("opentelemetry.proto.logs.v1.rs"); + } +} + +/// Generated types used in resources. +pub mod resource { + pub mod v1 { + include!("opentelemetry.proto.resource.v1.rs"); + } +} diff --git a/server/src/handlers/http/query.rs b/server/src/handlers/http/query.rs index af5120c49..26f29b592 100644 --- a/server/src/handlers/http/query.rs +++ b/server/src/handlers/http/query.rs @@ -20,24 +20,33 @@ use actix_web::http::header::ContentType; use actix_web::web::{self, Json}; use actix_web::{FromRequest, HttpRequest, Responder}; use chrono::{DateTime, Utc}; +use datafusion::common::tree_node::TreeNode; use datafusion::error::DataFusionError; use datafusion::execution::context::SessionState; use futures_util::Future; use http::StatusCode; use std::collections::HashMap; use std::pin::Pin; +use std::sync::Arc; use std::time::Instant; +use crate::event::error::EventError; +use crate::handlers::http::fetch_schema; + +use crate::event::commit_schema; use crate::metrics::QUERY_EXECUTE_TIME; +use crate::option::{Mode, CONFIG}; use crate::query::error::ExecuteError; -use crate::query::QUERY_SESSION; +use crate::query::{TableScanVisitor, QUERY_SESSION}; use crate::rbac::role::{Action, Permission}; use crate::rbac::Users; use crate::response::QueryResponse; +use crate::storage::object_storage::commit_schema_to_storage; +use crate::storage::ObjectStorageError; use crate::utils::actix::extract_session_key_from_req; /// Query Request through http endpoint. 
-#[derive(Debug, serde::Deserialize)] +#[derive(Debug, serde::Deserialize, serde::Serialize)] #[serde(rename_all = "camelCase")] pub struct Query { query: String, @@ -52,11 +61,37 @@ pub struct Query { } pub async fn query(req: HttpRequest, query_request: Query) -> Result { - let creds = extract_session_key_from_req(&req).expect("expects basic auth"); - let permissions = Users.get_permissions(&creds); let session_state = QUERY_SESSION.state(); + + // get the logical plan and extract the table name + let raw_logical_plan = session_state + .create_logical_plan(&query_request.query) + .await?; + // create a visitor to extract the table name + let mut visitor = TableScanVisitor::default(); + let _ = raw_logical_plan.visit(&mut visitor); + let table_name = visitor + .into_inner() + .pop() + .ok_or(QueryError::MalformedQuery( + "No table found from sql".to_string(), + ))?; + + if CONFIG.parseable.mode == Mode::Query { + if let Ok(new_schema) = fetch_schema(&table_name).await { + // commit schema merges the schema internally and updates the schema in storage. + commit_schema_to_storage(&table_name, new_schema.clone()) + .await + .map_err(QueryError::ObjectStorage)?; + commit_schema(&table_name, Arc::new(new_schema)).map_err(QueryError::EventError)?; + } + } + let mut query = into_query(&query_request, &session_state).await?; + let creds = extract_session_key_from_req(&req).expect("expects basic auth"); + let permissions = Users.get_permissions(&creds); + // check authorization of this query if it references physical table; let table_name = query.table_name(); if let Some(ref table) = table_name { @@ -94,7 +129,7 @@ pub async fn query(req: HttpRequest, query_request: Query) -> Result Option { + if query.query.is_empty() { + return None; + } + + if query.start_time.is_empty() { + return None; + } + + if query.end_time.is_empty() { + return None; + } + + let end_time: DateTime = if query.end_time == "now" { + Utc::now() + } else { + DateTime::parse_from_rfc3339(&query.end_time) + .ok()? 
+ .with_timezone(&Utc) + }; + + let start_time = end_time - chrono::Duration::minutes(1); + // when transforming the query, the ingestors are forced to return an array of values + let q = Query { + query: query.query.clone(), + fields: false, + filter_tags: query.filter_tags.clone(), + send_null: query.send_null, + start_time: start_time.to_rfc3339(), + end_time: end_time.to_rfc3339(), + }; + + Some(q) +} + #[derive(Debug, thiserror::Error)] pub enum QueryError { #[error("Query cannot be empty")] @@ -207,6 +279,12 @@ pub enum QueryError { Datafusion(#[from] DataFusionError), #[error("Execution Error: {0}")] Execute(#[from] ExecuteError), + #[error("ObjectStorage Error: {0}")] + ObjectStorage(#[from] ObjectStorageError), + #[error("Evern Error: {0}")] + EventError(#[from] EventError), + #[error("Error: {0}")] + MalformedQuery(String), } impl actix_web::ResponseError for QueryError { diff --git a/server/src/handlers/livetail.rs b/server/src/handlers/livetail.rs index de9970a1e..0f9711c78 100644 --- a/server/src/handlers/livetail.rs +++ b/server/src/handlers/livetail.rs @@ -264,13 +264,17 @@ fn extract_basic_auth(header: &MetadataMap) -> Option { } fn extract_cookie(header: &MetadataMap) -> Option { - let cookies = header - .get("Cookie") - .and_then(|value| value.to_str().ok()) - .map(Cookie::split_parse)?; + // extract the cookie from the request + let cookies = header.get_all("cookie"); + let cookies: Vec<_> = cookies + .iter() + .filter_map(|value| value.to_str().ok()) + .flat_map(Cookie::split_parse) + .map(|value| value.expect("cookie is parseable")) + .collect(); cookies - .flatten() + .into_iter() .find(|cookie| cookie.name() == SESSION_COOKIE_NAME) } diff --git a/server/src/main.rs b/server/src/main.rs index 954ed6ddd..04d6ed8b7 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -16,20 +16,12 @@ * */ -use clokwerk::{AsyncScheduler, Job, Scheduler, TimeUnits}; -use thread_priority::{ThreadBuilder, ThreadPriority}; -use tokio::sync::oneshot; -use tokio::sync::oneshot::error::TryRecvError; - -use std::panic::{catch_unwind, AssertUnwindSafe}; -use std::thread::{self, JoinHandle}; -use std::time::Duration; - mod about; mod alerts; mod analytics; mod banner; mod catalog; +mod cli; mod event; mod handlers; mod livetail; @@ -42,162 +34,40 @@ mod option; mod query; mod rbac; mod response; +mod static_schema; mod stats; mod storage; +mod sync; mod utils; mod validator; -use option::CONFIG; +use std::sync::Arc; + +use handlers::http::modal::ParseableServer; +use option::{Mode, CONFIG}; -use crate::localcache::LocalCacheManager; +use crate::{ + handlers::http::modal::{ + ingest_server::IngestServer, query_server::QueryServer, server::Server, + }, + // localcache::LocalCacheManager, +}; +pub const STORAGE_UPLOAD_INTERVAL: u32 = 60; #[actix_web::main] async fn main() -> anyhow::Result<()> { env_logger::init(); - let storage = CONFIG.storage().get_object_store(); - migration::run_metadata_migration(&CONFIG).await?; - let metadata = storage::resolve_parseable_metadata().await?; - CONFIG.validate_staging()?; - banner::print(&CONFIG, &metadata).await; - rbac::map::init(&metadata); - metadata.set_global(); - if let Some(cache_manager) = LocalCacheManager::global() { - cache_manager - .validate(CONFIG.parseable.local_cache_size) - .await?; - }; - let prometheus = metrics::build_metrics_handler(); - CONFIG.storage().register_store_metrics(&prometheus); - - migration::run_migration(&CONFIG).await?; - if let Err(e) = metadata::STREAM_INFO.load(&*storage).await { - log::warn!("could not populate 
local metadata. {:?}", e); - } + // these are empty ptrs so mem footprint should be minimal + let server: Arc = match CONFIG.parseable.mode { + Mode::Query => Arc::new(QueryServer), - // track all parquet files already in the data directory - storage::retention::load_retention_from_global().await; - // load data from stats back to prometheus metrics - metrics::load_from_stats_from_storage().await; + Mode::Ingest => Arc::new(IngestServer), - let (localsync_handler, mut localsync_outbox, localsync_inbox) = run_local_sync(); - let (mut remote_sync_handler, mut remote_sync_outbox, mut remote_sync_inbox) = - object_store_sync(); - - // all internal data structures populated now. - // start the analytics scheduler if enabled - if CONFIG.parseable.send_analytics { - analytics::init_analytics_scheduler(); - } - - tokio::spawn(handlers::livetail::server()); - - let app = handlers::http::run_http(prometheus, CONFIG.parseable.openid.clone()); - tokio::pin!(app); - loop { - tokio::select! { - e = &mut app => { - // actix server finished .. stop other threads and stop the server - remote_sync_inbox.send(()).unwrap_or(()); - localsync_inbox.send(()).unwrap_or(()); - localsync_handler.join().unwrap_or(()); - remote_sync_handler.join().unwrap_or(()); - return e - }, - _ = &mut localsync_outbox => { - // crash the server if localsync fails for any reason - // panic!("Local Sync thread died. Server will fail now!") - return Err(anyhow::Error::msg("Failed to sync local data to drive. Please restart the Parseable server.\n\nJoin us on Parseable Slack if the issue persists after restart : https://launchpass.com/parseable")) - }, - _ = &mut remote_sync_outbox => { - // remote_sync failed, this is recoverable by just starting remote_sync thread again - remote_sync_handler.join().unwrap_or(()); - (remote_sync_handler, remote_sync_outbox, remote_sync_inbox) = object_store_sync(); - } - - }; - } -} - -fn object_store_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) { - let (outbox_tx, outbox_rx) = oneshot::channel::<()>(); - let (inbox_tx, inbox_rx) = oneshot::channel::<()>(); - let mut inbox_rx = AssertUnwindSafe(inbox_rx); - let handle = thread::spawn(move || { - let res = catch_unwind(move || { - let rt = actix_web::rt::System::new(); - rt.block_on(async { - let mut scheduler = AsyncScheduler::new(); - scheduler - .every((CONFIG.parseable.upload_interval as u32).seconds()) - // Extra time interval is added so that this schedular does not race with local sync. - .plus(5u32.seconds()) - .run(|| async { - if let Err(e) = CONFIG.storage().get_object_store().sync().await { - log::warn!("failed to sync local data with object store. 
{:?}", e); - } - }); - - loop { - tokio::time::sleep(Duration::from_secs(1)).await; - scheduler.run_pending().await; - match AssertUnwindSafe(|| inbox_rx.try_recv())() { - Ok(_) => break, - Err(TryRecvError::Empty) => continue, - Err(TryRecvError::Closed) => { - // should be unreachable but breaking anyways - break; - } - } - } - }) - }); - - if res.is_err() { - outbox_tx.send(()).unwrap(); - } - }); - - (handle, outbox_rx, inbox_tx) -} - -fn run_local_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) { - let (outbox_tx, outbox_rx) = oneshot::channel::<()>(); - let (inbox_tx, inbox_rx) = oneshot::channel::<()>(); - let mut inbox_rx = AssertUnwindSafe(inbox_rx); - - let handle = ThreadBuilder::default() - .name("local-sync") - .priority(ThreadPriority::Max) - .spawn(move |priority_result| { - if priority_result.is_err() { - log::warn!("Max priority cannot be set for sync thread. Make sure that user/program is allowed to set thread priority.") - } - let res = catch_unwind(move || { - let mut scheduler = Scheduler::new(); - scheduler - .every((storage::LOCAL_SYNC_INTERVAL as u32).seconds()) - .run(move || crate::event::STREAM_WRITERS.unset_all()); - - loop { - thread::sleep(Duration::from_millis(50)); - scheduler.run_pending(); - match AssertUnwindSafe(|| inbox_rx.try_recv())() { - Ok(_) => break, - Err(TryRecvError::Empty) => continue, - Err(TryRecvError::Closed) => { - // should be unreachable but breaking anyways - break; - } - } - } - }); + Mode::All => Arc::new(Server), + }; - if res.is_err() { - outbox_tx.send(()).unwrap(); - } - }) - .unwrap(); + server.init().await?; - (handle, outbox_rx, inbox_tx) + Ok(()) } diff --git a/server/src/metadata.rs b/server/src/metadata.rs index b57cc710a..8fdb7597d 100644 --- a/server/src/metadata.rs +++ b/server/src/metadata.rs @@ -18,6 +18,7 @@ use arrow_array::RecordBatch; use arrow_schema::{Field, Fields, Schema}; +use chrono::Local; use itertools::Itertools; use once_cell::sync::Lazy; use std::collections::HashMap; @@ -25,7 +26,7 @@ use std::sync::{Arc, RwLock}; use crate::alerts::Alerts; use crate::metrics::{EVENTS_INGESTED, EVENTS_INGESTED_SIZE}; -use crate::storage::{ObjectStorage, StorageDir}; +use crate::storage::{LogStream, ObjectStorage, StorageDir}; use crate::utils::arrow::MergedRecordReader; use self::error::stream_info::{CheckAlertError, LoadError, MetadataError}; @@ -43,6 +44,10 @@ pub struct LogStreamMetadata { pub schema: HashMap>, pub alerts: Alerts, pub cache_enabled: bool, + pub created_at: String, + pub first_event_at: Option, + pub time_partition: Option, + pub static_schema_flag: Option, } // It is very unlikely that panic will occur when dealing with metadata. 
@@ -88,6 +93,23 @@ impl StreamInfo { .map(|metadata| metadata.cache_enabled) } + pub fn get_time_partition(&self, stream_name: &str) -> Result, MetadataError> { + let map = self.read().expect(LOCK_EXPECT); + map.get(stream_name) + .ok_or(MetadataError::StreamMetaNotFound(stream_name.to_string())) + .map(|metadata| metadata.time_partition.clone()) + } + + pub fn get_static_schema_flag( + &self, + stream_name: &str, + ) -> Result, MetadataError> { + let map = self.read().expect(LOCK_EXPECT); + map.get(stream_name) + .ok_or(MetadataError::StreamMetaNotFound(stream_name.to_string())) + .map(|metadata| metadata.static_schema_flag.clone()) + } + pub fn set_stream_cache(&self, stream_name: &str, enable: bool) -> Result<(), MetadataError> { let mut map = self.write().expect(LOCK_EXPECT); let stream = map @@ -126,9 +148,49 @@ impl StreamInfo { }) } - pub fn add_stream(&self, stream_name: String) { + pub fn set_first_event_at( + &self, + stream_name: &str, + first_event_at: Option, + ) -> Result<(), MetadataError> { + let mut map = self.write().expect(LOCK_EXPECT); + map.get_mut(stream_name) + .ok_or(MetadataError::StreamMetaNotFound(stream_name.to_string())) + .map(|metadata| { + metadata.first_event_at = first_event_at; + }) + } + + pub fn add_stream( + &self, + stream_name: String, + created_at: String, + time_partition: String, + static_schema_flag: String, + static_schema: HashMap>, + ) { let mut map = self.write().expect(LOCK_EXPECT); let metadata = LogStreamMetadata { + created_at: if created_at.is_empty() { + Local::now().to_rfc3339() + } else { + created_at + }, + time_partition: if time_partition.is_empty() { + None + } else { + Some(time_partition) + }, + static_schema_flag: if static_schema_flag != "true" { + None + } else { + Some(static_schema_flag) + }, + schema: if static_schema.is_empty() { + HashMap::new() + } else { + static_schema + }, ..Default::default() }; map.insert(stream_name, metadata); @@ -146,28 +208,41 @@ impl StreamInfo { // return error in case of an error from object storage itself. for stream in storage.list_streams().await? 
{ - let alerts = storage.get_alerts(&stream.name).await?; - let schema = storage.get_schema(&stream.name).await?; - let meta = storage.get_stream_metadata(&stream.name).await?; - - let schema = update_schema_from_staging(&stream.name, schema); - let schema = HashMap::from_iter( - schema - .fields - .iter() - .map(|v| (v.name().to_owned(), v.clone())), - ); - - let metadata = LogStreamMetadata { - schema, - alerts, - cache_enabled: meta.cache_enabled, - }; - - let mut map = self.write().expect(LOCK_EXPECT); - - map.insert(stream.name, metadata); + self.upsert_stream_info(storage, stream).await?; } + Ok(()) + } + + pub async fn upsert_stream_info( + &self, + storage: &(impl ObjectStorage + ?Sized), + stream: LogStream, + ) -> Result<(), LoadError> { + let alerts = storage.get_alerts(&stream.name).await?; + let schema = storage.get_schema_on_server_start(&stream.name).await?; + let meta = storage.get_stream_metadata(&stream.name).await?; + + let schema = update_schema_from_staging(&stream.name, schema); + let schema = HashMap::from_iter( + schema + .fields + .iter() + .map(|v| (v.name().to_owned(), v.clone())), + ); + + let metadata = LogStreamMetadata { + schema, + alerts, + cache_enabled: meta.cache_enabled, + created_at: meta.created_at, + first_event_at: meta.first_event_at, + time_partition: meta.time_partition, + static_schema_flag: meta.static_schema_flag, + }; + + let mut map = self.write().expect(LOCK_EXPECT); + + map.insert(stream.name, metadata); Ok(()) } @@ -222,6 +297,8 @@ pub mod error { pub enum MetadataError { #[error("Metadata for stream {0} not found. Please create the stream and try again")] StreamMetaNotFound(String), + #[error("Metadata Error: {0}")] + StandaloneWithDistributed(String), } #[derive(Debug, thiserror::Error)] diff --git a/server/src/metrics/mod.rs b/server/src/metrics/mod.rs index 05e6baf86..3e337123d 100644 --- a/server/src/metrics/mod.rs +++ b/server/src/metrics/mod.rs @@ -16,6 +16,7 @@ * */ +pub mod prom_utils; pub mod storage; use actix_web_prometheus::{PrometheusMetrics, PrometheusMetricsBuilder}; @@ -133,7 +134,7 @@ fn prom_process_metrics(metrics: &PrometheusMetrics) { #[cfg(not(target_os = "linux"))] fn prom_process_metrics(_metrics: &PrometheusMetrics) {} -pub async fn load_from_stats_from_storage() { +pub async fn fetch_stats_from_storage() { for stream_name in STREAM_INFO.list_streams() { let stats = CONFIG .storage() diff --git a/server/src/metrics/prom_utils.rs b/server/src/metrics/prom_utils.rs new file mode 100644 index 000000000..21e27c03f --- /dev/null +++ b/server/src/metrics/prom_utils.rs @@ -0,0 +1,91 @@ +use crate::utils::get_url; +use prometheus_parse::Sample as PromSample; +use prometheus_parse::Value as PromValue; +use serde::Serialize; +use serde_json::Error as JsonError; +use serde_json::Value as JsonValue; + +#[derive(Debug, Serialize, Clone)] +pub struct Metrics { + address: String, + parseable_events_ingested: f64, // all streams + parseable_staging_files: f64, + process_resident_memory_bytes: f64, + parseable_storage_size: StorageMetrics, +} + +#[derive(Debug, Serialize, Default, Clone)] +struct StorageMetrics { + staging: f64, + data: f64, +} + +impl Default for Metrics { + fn default() -> Self { + let url = get_url(); + let address = format!( + "http://{}:{}", + url.domain() + .unwrap_or(url.host_str().expect("should have a host")), + url.port().unwrap_or_default() + ); + Metrics { + address, + parseable_events_ingested: 0.0, + parseable_staging_files: 0.0, + process_resident_memory_bytes: 0.0, + parseable_storage_size: 
StorageMetrics::default(), + } + } +} + +impl Metrics { + fn new(address: String) -> Self { + Metrics { + address, + parseable_events_ingested: 0.0, + parseable_staging_files: 0.0, + process_resident_memory_bytes: 0.0, + parseable_storage_size: StorageMetrics::default(), + } + } +} + +impl Metrics { + pub fn from_prometheus_samples(samples: Vec, address: String) -> Self { + let mut prom_dress = Metrics::new(address); + + for sample in samples { + if &sample.metric == "parseable_events_ingested" { + if let PromValue::Counter(val) = sample.value { + prom_dress.parseable_events_ingested += val; + } + } else if sample.metric == "parseable_staging_files" { + if let PromValue::Gauge(val) = sample.value { + prom_dress.parseable_staging_files += val; + } + } else if sample.metric == "process_resident_memory_bytes" { + if let PromValue::Gauge(val) = sample.value { + prom_dress.process_resident_memory_bytes += val; + } + } else if sample.metric == "parseable_storage_size" { + if sample.labels.get("type").expect("type is present") == "data" { + if let PromValue::Gauge(val) = sample.value { + prom_dress.parseable_storage_size.data += val; + } + } else if sample.labels.get("type").expect("type is present") == "staging" { + if let PromValue::Gauge(val) = sample.value { + prom_dress.parseable_storage_size.staging += val; + } + } + } + } + + prom_dress + } + + #[allow(unused)] + pub fn to_json(&self) -> Result { + serde_json::to_value(self) + } +} diff --git a/server/src/migration.rs b/server/src/migration.rs index 899d86eae..9e7e9a3db 100644 --- a/server/src/migration.rs +++ b/server/src/migration.rs @@ -21,17 +21,24 @@ mod metadata_migration; mod schema_migration; mod stream_metadata_migration; -use std::fs::OpenOptions; +use std::{fs::OpenOptions, sync::Arc}; use bytes::Bytes; +use itertools::Itertools; use relative_path::RelativePathBuf; use serde::Serialize; use crate::{ option::Config, - storage::{ObjectStorage, ObjectStorageError}, + storage::{ + object_storage::{parseable_json_path, stream_json_path}, + ObjectStorage, ObjectStorageError, PARSEABLE_METADATA_FILE_NAME, PARSEABLE_ROOT_DIRECTORY, + SCHEMA_FILE_NAME, STREAM_ROOT_DIRECTORY, + }, }; +/// Migrate the metdata from v1 or v2 to v3 +/// This is a one time migration pub async fn run_metadata_migration(config: &Config) -> anyhow::Result<()> { let object_store = config.storage().get_object_store(); let storage_metadata = get_storage_metadata(&*object_store).await?; @@ -44,6 +51,7 @@ pub async fn run_metadata_migration(config: &Config) -> anyhow::Result<()> { .and_then(|version| version.as_str()) } + // if storage metadata is none do nothing if let Some(storage_metadata) = storage_metadata { match get_version(&storage_metadata) { Some("v1") => { @@ -54,10 +62,15 @@ pub async fn run_metadata_migration(config: &Config) -> anyhow::Result<()> { let metadata = metadata_migration::v2_v3(storage_metadata); put_remote_metadata(&*object_store, &metadata).await?; } + Some("v3") => { + let mdata = metadata_migration::update_v3(storage_metadata); + put_remote_metadata(&*object_store, &mdata).await?; + } _ => (), } } + // if staging metadata is none do nothing if let Some(staging_metadata) = staging_metadata { match get_version(&staging_metadata) { Some("v1") => { @@ -68,6 +81,10 @@ pub async fn run_metadata_migration(config: &Config) -> anyhow::Result<()> { let metadata = metadata_migration::v2_v3(staging_metadata); put_staging_metadata(config, &metadata)?; } + Some("v3") => { + let mdata = metadata_migration::update_v3(staging_metadata); + 
put_staging_metadata(config, &mdata)?; + } _ => (), } } @@ -75,6 +92,7 @@ pub async fn run_metadata_migration(config: &Config) -> anyhow::Result<()> { Ok(()) } +/// run the migration for all streams pub async fn run_migration(config: &Config) -> anyhow::Result<()> { let storage = config.storage().get_object_store(); let streams = storage.list_streams().await?; @@ -87,7 +105,8 @@ pub async fn run_migration(config: &Config) -> anyhow::Result<()> { } async fn migration_stream(stream: &str, storage: &dyn ObjectStorage) -> anyhow::Result<()> { - let path = RelativePathBuf::from_iter([stream, ".stream.json"]); + let path = stream_json_path(stream); + let stream_metadata = storage.get_object(&path).await?; let stream_metadata: serde_json::Value = serde_json::from_slice(&stream_metadata).expect("stream.json is valid json"); @@ -104,7 +123,8 @@ async fn migration_stream(stream: &str, storage: &dyn ObjectStorage) -> anyhow:: .put_object(&path, to_bytes(&new_stream_metadata)) .await?; - let schema_path = RelativePathBuf::from_iter([stream, ".schema"]); + let schema_path = + RelativePathBuf::from_iter([stream, STREAM_ROOT_DIRECTORY, SCHEMA_FILE_NAME]); let schema = storage.get_object(&schema_path).await?; let schema = serde_json::from_slice(&schema).ok(); let map = schema_migration::v1_v3(schema)?; @@ -116,7 +136,8 @@ async fn migration_stream(stream: &str, storage: &dyn ObjectStorage) -> anyhow:: .put_object(&path, to_bytes(&new_stream_metadata)) .await?; - let schema_path = RelativePathBuf::from_iter([stream, ".schema"]); + let schema_path = + RelativePathBuf::from_iter([stream, STREAM_ROOT_DIRECTORY, SCHEMA_FILE_NAME]); let schema = storage.get_object(&schema_path).await?; let schema = serde_json::from_slice(&schema)?; let map = schema_migration::v2_v3(schema)?; @@ -136,7 +157,8 @@ fn to_bytes(any: &(impl ?Sized + Serialize)) -> Bytes { } pub fn get_staging_metadata(config: &Config) -> anyhow::Result> { - let path = config.staging_dir().join(".parseable.json"); + let path = parseable_json_path().to_path(config.staging_dir()); + let bytes = match std::fs::read(path) { Ok(bytes) => bytes, Err(err) => match err.kind() { @@ -145,13 +167,14 @@ pub fn get_staging_metadata(config: &Config) -> anyhow::Result anyhow::Result> { - let path = RelativePathBuf::from_iter([".parseable.json"]); + let path = parseable_json_path(); match storage.get_object(&path).await { Ok(bytes) => Ok(Some( serde_json::from_slice(&bytes).expect("parseable config is valid json"), @@ -170,13 +193,13 @@ pub async fn put_remote_metadata( storage: &dyn ObjectStorage, metadata: &serde_json::Value, ) -> anyhow::Result<()> { - let path = RelativePathBuf::from_iter([".parseable.json"]); + let path = parseable_json_path(); let metadata = serde_json::to_vec(metadata)?.into(); Ok(storage.put_object(&path, metadata).await?) 
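// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the version dispatch used by
// run_metadata_migration above, reduced to a self-contained example. The
// metadata JSON carries a "version" field; each known version gets its own
// upgrade function. The v1_v3/v2_v3/update_v3 bodies below are stand-ins for
// the real ones in server/src/migration/metadata_migration.rs.
use serde_json::{json, Value};

fn upgrade_metadata(metadata: Value) -> Value {
    let version = metadata
        .get("version")
        .and_then(|v| v.as_str())
        .unwrap_or("unknown")
        .to_owned();

    match version.as_str() {
        "v1" => v1_v3(metadata),
        "v2" => v2_v3(metadata),
        // already current: only backfill newly introduced fields
        "v3" => update_v3(metadata),
        // unknown or missing version: leave the document untouched
        _ => metadata,
    }
}

fn v1_v3(mut m: Value) -> Value {
    m["version"] = json!("v3");
    m
}

fn v2_v3(mut m: Value) -> Value {
    m["version"] = json!("v3");
    m
}

fn update_v3(mut m: Value) -> Value {
    m.as_object_mut()
        .expect("metadata is a JSON object")
        .entry("server_mode")
        .or_insert(json!("All"));
    m
}

// e.g. upgrade_metadata(json!({ "version": "v1", "user": "admin" })) returns a
// document whose "version" is "v3".
// ---------------------------------------------------------------------------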
} pub fn put_staging_metadata(config: &Config, metadata: &serde_json::Value) -> anyhow::Result<()> { - let path = config.staging_dir().join(".parseable.json"); + let path = parseable_json_path().to_path(config.staging_dir()); let mut file = OpenOptions::new() .create(true) .truncate(true) @@ -185,3 +208,92 @@ pub fn put_staging_metadata(config: &Config, metadata: &serde_json::Value) -> an serde_json::to_writer(&mut file, metadata)?; Ok(()) } + +pub async fn run_file_migration(config: &Config) -> anyhow::Result<()> { + let object_store = config.storage().get_object_store(); + + let old_meta_file_path = RelativePathBuf::from(PARSEABLE_METADATA_FILE_NAME); + + // if this errors, that means migration is already done + if let Err(err) = object_store.get_object(&old_meta_file_path).await { + if matches!(err, ObjectStorageError::NoSuchKey(_)) { + return Ok(()); + } + return Err(err.into()); + } + + run_meta_file_migration(&object_store, old_meta_file_path).await?; + run_stream_files_migration(object_store).await?; + + Ok(()) +} + +async fn run_meta_file_migration( + object_store: &Arc, + old_meta_file_path: RelativePathBuf, +) -> anyhow::Result<()> { + log::info!("Migrating metadata files to new location"); + + // get the list of all meta files + let mut meta_files = object_store.get_ingestor_meta_file_paths().await?; + meta_files.push(old_meta_file_path); + + for file in meta_files { + match object_store.get_object(&file).await { + Ok(bytes) => { + // we can unwrap here because we know the file exists + let new_path = RelativePathBuf::from_iter([ + PARSEABLE_ROOT_DIRECTORY, + file.file_name().expect("should have a file name"), + ]); + object_store.put_object(&new_path, bytes).await?; + object_store.delete_object(&file).await?; + } + Err(err) => { + // if error is not a no such key error, something weird happened + // so return the error + if !matches!(err, ObjectStorageError::NoSuchKey(_)) { + return Err(err.into()); + } + } + } + } + + Ok(()) +} + +async fn run_stream_files_migration( + object_store: Arc, +) -> anyhow::Result<()> { + let streams = object_store + .list_old_streams() + .await?
+ .into_iter() + .map(|stream| stream.name) + .collect_vec(); + + for stream in streams { + let paths = object_store.get_stream_file_paths(&stream).await?; + + for path in paths { + match object_store.get_object(&path).await { + Ok(bytes) => { + let new_path = RelativePathBuf::from_iter([ + stream.as_str(), + STREAM_ROOT_DIRECTORY, + path.file_name().expect("should have a file name"), + ]); + object_store.put_object(&new_path, bytes).await?; + object_store.delete_object(&path).await?; + } + Err(err) => { + if !matches!(err, ObjectStorageError::NoSuchKey(_)) { + return Err(err.into()); + } + } + } + } + } + + Ok(()) +} diff --git a/server/src/migration/metadata_migration.rs b/server/src/migration/metadata_migration.rs index 36507a28f..cbeee200a 100644 --- a/server/src/migration/metadata_migration.rs +++ b/server/src/migration/metadata_migration.rs @@ -17,22 +17,54 @@ */ use rand::distributions::DistString; -use serde_json::{Map, Value}; +use serde_json::{Map, Value as JsonValue}; -pub fn v1_v3(mut storage_metadata: serde_json::Value) -> Value { +use crate::option::CONFIG; + +/* +v1 +{ + "version": "v1", + "mode": "drive" + "user": string, + "staging": "string", + "storage": "string", + "deployment_id": "string" + "stream": string, + "default_role": null +} +*/ +pub fn v1_v3(mut storage_metadata: JsonValue) -> JsonValue { let metadata = storage_metadata.as_object_mut().unwrap(); - *metadata.get_mut("version").unwrap() = Value::String("v3".to_string()); + *metadata.get_mut("version").unwrap() = JsonValue::String("v3".to_string()); metadata.remove("user"); metadata.remove("stream"); - metadata.insert("users".to_string(), Value::Array(vec![])); - metadata.insert("streams".to_string(), Value::Array(vec![])); - metadata.insert("roles".to_string(), Value::Array(vec![])); + metadata.insert("users".to_string(), JsonValue::Array(vec![])); + metadata.insert("streams".to_string(), JsonValue::Array(vec![])); + metadata.insert("roles".to_string(), JsonValue::Array(vec![])); + metadata.insert( + "server_mode".to_string(), + JsonValue::String(CONFIG.parseable.mode.to_string()), + ); storage_metadata } -pub fn v2_v3(mut storage_metadata: serde_json::Value) -> Value { +/* +v2 +{ + "version": "v2", + "users": [ + { + "role": ["privilege1", "privilege2", ...] + }, + ... + ] + ... 
+} +*/ +pub fn v2_v3(mut storage_metadata: JsonValue) -> JsonValue { let metadata = storage_metadata.as_object_mut().unwrap(); - *metadata.get_mut("version").unwrap() = Value::String("v3".to_string()); + *metadata.get_mut("version").unwrap() = JsonValue::String("v3".to_string()); let users = metadata .get_mut("users") .expect("users field is present") @@ -46,7 +78,7 @@ pub fn v2_v3(mut storage_metadata: serde_json::Value) -> Value { // user is an object let user = user.as_object_mut().unwrap(); // take out privileges - let Value::Array(privileges) = user.remove("role").expect("role exists for v2") else { + let JsonValue::Array(privileges) = user.remove("role").expect("role exists for v2") else { panic!("privileges is an arrray") }; @@ -55,15 +87,34 @@ pub fn v2_v3(mut storage_metadata: serde_json::Value) -> Value { if !privileges.is_empty() { let role_name = rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 8); - privileges_map.push((role_name.clone(), Value::Array(privileges))); - roles.push(Value::from(role_name)); + privileges_map.push((role_name.clone(), JsonValue::Array(privileges))); + roles.push(JsonValue::from(role_name)); } user.insert("roles".to_string(), roles.into()); } metadata.insert( "roles".to_string(), - Value::Object(Map::from_iter(privileges_map)), + JsonValue::Object(Map::from_iter(privileges_map)), + ); + metadata.insert( + "server_mode".to_string(), + JsonValue::String(CONFIG.parseable.mode.to_string()), ); storage_metadata } + +// maybe rename +pub fn update_v3(mut storage_metadata: JsonValue) -> JsonValue { + let metadata = storage_metadata.as_object_mut().unwrap(); + let sm = metadata.get("server_mode"); + + if sm.is_none() || sm.unwrap().as_str().unwrap() == "All" { + metadata.insert( + "server_mode".to_string(), + JsonValue::String(CONFIG.parseable.mode.to_string()), + ); + } + + storage_metadata +} diff --git a/server/src/option.rs b/server/src/option.rs index 8b3983170..0fb12f7f4 100644 --- a/server/src/option.rs +++ b/server/src/option.rs @@ -17,37 +17,44 @@ */ use clap::error::ErrorKind; -use clap::{command, value_parser, Arg, ArgGroup, Args, Command, FromArgMatches}; +use clap::{command, Args, Command, FromArgMatches}; use once_cell::sync::Lazy; use parquet::basic::{BrotliLevel, GzipLevel, ZstdLevel}; use std::env; use std::path::PathBuf; use std::sync::Arc; -use url::Url; - -use crate::oidc::{self, OpenidConfig}; -use crate::storage::{FSConfig, ObjectStorageProvider, S3Config}; -use crate::utils::validate_path_is_writeable; +use crate::cli::Cli; +use crate::storage::object_storage::parseable_json_path; +use crate::storage::{FSConfig, ObjectStorageError, ObjectStorageProvider, S3Config}; pub const MIN_CACHE_SIZE_BYTES: u64 = 1000u64.pow(3); // 1 GiB - +pub const JOIN_COMMUNITY: &str = + "Join us on Parseable Slack community for questions : https://logg.ing/community"; pub static CONFIG: Lazy> = Lazy::new(|| Arc::new(Config::new())); #[derive(Debug)] pub struct Config { - pub parseable: Server, + pub parseable: Cli, storage: Arc, pub storage_name: &'static str, } impl Config { fn new() -> Self { - let cli = parseable_cli_command().get_matches(); + let cli = create_parseable_cli_command() + .name("Parseable") + .about("A Cloud Native, log analytics platform") + .before_help("Log Lake for the cloud-native world") + .arg_required_else_help(true) + .subcommand_required(true) + .color(clap::ColorChoice::Always) + .get_matches(); + match cli.subcommand() { Some(("local-store", m)) => { - let server = match Server::from_arg_matches(m) { - 
Ok(server) => server, + let cli = match Cli::from_arg_matches(m) { + Ok(cli) => cli, Err(err) => err.exit(), }; let storage = match FSConfig::from_arg_matches(m) { @@ -55,8 +62,8 @@ impl Config { Err(err) => err.exit(), }; - if server.local_staging_path == storage.root { - parseable_cli_command() + if cli.local_staging_path == storage.root { + create_parseable_cli_command() .error( ErrorKind::ValueValidation, "Cannot use same path for storage and staging", @@ -64,8 +71,8 @@ impl Config { .exit() } - if server.local_cache_path.is_some() { - parseable_cli_command() + if cli.local_cache_path.is_some() { + create_parseable_cli_command() .error( ErrorKind::ValueValidation, "Cannot use cache with local-store subcommand.", @@ -74,14 +81,14 @@ impl Config { } Config { - parseable: server, + parseable: cli, storage: Arc::new(storage), storage_name: "drive", } } Some(("s3-store", m)) => { - let server = match Server::from_arg_matches(m) { - Ok(server) => server, + let cli = match Cli::from_arg_matches(m) { + Ok(cli) => cli, Err(err) => err.exit(), }; let storage = match S3Config::from_arg_matches(m) { @@ -90,7 +97,7 @@ impl Config { }; Config { - parseable: server, + parseable: cli, storage: Arc::new(storage), storage_name: "s3", } @@ -99,9 +106,33 @@ impl Config { } } - pub fn validate_staging(&self) -> anyhow::Result<()> { - let staging_path = self.staging_dir(); - validate_path_is_writeable(staging_path) + // validate the storage, if the proper path for staging directory is provided + // if the proper data directory is provided, or s3 bucket is provided etc + pub async fn validate_storage(&self) -> Result<(), ObjectStorageError> { + let obj_store = self.storage.get_object_store(); + let rel_path = parseable_json_path(); + + let has_parseable_json = obj_store.get_object(&rel_path).await.is_ok(); + + // Lists all the directories in the root of the bucket/directory + // can be a stream (if it contains .stream.json file) or not + let has_dirs = match obj_store.list_dirs().await { + Ok(dirs) => !dirs.is_empty(), + Err(_) => false, + }; + + let has_streams = obj_store.list_streams().await.is_ok(); + + if has_streams || !has_dirs && !has_parseable_json { + return Ok(()); + } + + if self.get_storage_mode_string() == "Local drive" { + return Err(ObjectStorageError::Custom(format!("Could not start the server because directory '{}' contains stale data, please use an empty directory, and restart the server.\n{}", self.storage.get_endpoint(), JOIN_COMMUNITY))); + } + + // S3 bucket mode + Err(ObjectStorageError::Custom(format!("Could not start the server because bucket '{}' contains stale data, please use an empty bucket and restart the server.\n{}", self.storage.get_endpoint(), JOIN_COMMUNITY))) } pub fn storage(&self) -> Arc { @@ -121,34 +152,41 @@ impl Config { } pub fn is_default_creds(&self) -> bool { - self.parseable.username == Server::DEFAULT_USERNAME - && self.parseable.password == Server::DEFAULT_PASSWORD + self.parseable.username == Cli::DEFAULT_USERNAME + && self.parseable.password == Cli::DEFAULT_PASSWORD } // returns the string representation of the storage mode // drive --> Local drive // s3 --> S3 bucket - pub fn mode_string(&self) -> &str { - let mut mode = "S3 bucket"; + pub fn get_storage_mode_string(&self) -> &str { if self.storage_name == "drive" { - mode = "Local drive"; + return "Local drive"; + } + "S3 bucket" + } + + pub fn get_server_mode_string(&self) -> &str { + match self.parseable.mode { + Mode::Query => "Distributed (Query)", + Mode::Ingest => "Distributed (Ingest)", + Mode::All 
=> "Standalone", } - mode } } -fn parseable_cli_command() -> Command { - let local = Server::get_clap_command("local-store"); +fn create_parseable_cli_command() -> Command { + let local = Cli::create_cli_command_with_clap("local-store"); let local = ::augment_args_for_update(local); let local = local - .mut_arg(Server::USERNAME, |arg| { - arg.required(false).default_value(Server::DEFAULT_USERNAME) + .mut_arg(Cli::USERNAME, |arg| { + arg.required(false).default_value(Cli::DEFAULT_USERNAME) }) - .mut_arg(Server::PASSWORD, |arg| { - arg.required(false).default_value(Server::DEFAULT_PASSWORD) + .mut_arg(Cli::PASSWORD, |arg| { + arg.required(false).default_value(Cli::DEFAULT_PASSWORD) }); - let s3 = Server::get_clap_command("s3-store"); + let s3 = Cli::create_cli_command_with_clap("s3-store"); let s3 = ::augment_args_for_update(s3); command!() @@ -159,7 +197,8 @@ fn parseable_cli_command() -> Command { .next_line_help(false) .help_template( r#" -{about} Join the community at https://launchpass.com/parseable. +{about} +Join the community at https://logg.ing/community. {all-args} "#, @@ -168,417 +207,36 @@ fn parseable_cli_command() -> Command { .subcommands([local, s3]) } -#[derive(Debug, Default)] -pub struct Server { - /// The location of TLS Cert file - pub tls_cert_path: Option, - - /// The location of TLS Private Key file - pub tls_key_path: Option, - - /// The address on which the http server will listen. - pub address: String, - - /// Base domain under which server is hosted. - /// This information is used by OIDC to refer redirects - pub domain_address: Option, - - /// The local staging path is used as a temporary landing point - /// for incoming events and local cache - pub local_staging_path: PathBuf, - - /// The local cache path is used for speeding up query on latest data - pub local_cache_path: Option, - - /// Size for local cache - pub local_cache_size: u64, - - /// Interval in seconds after which uncommited data would be - /// uploaded to the storage platform. 
- pub upload_interval: u64, - - /// Username for the basic authentication on the server - pub username: String, - - /// Password for the basic authentication on the server - pub password: String, - - /// OpenId configuration - pub openid: Option, - - /// Server should check for update or not - pub check_update: bool, - - /// Server should send anonymous analytics or not - pub send_analytics: bool, - - /// Open AI access key - pub open_ai_key: Option, - - /// Livetail port - pub grpc_port: u16, - - /// Livetail channel capacity - pub livetail_channel_capacity: usize, - - /// Rows in Parquet Rowgroup - pub row_group_size: usize, - - /// Query memory limit in bytes - pub query_memory_pool_size: Option, - - /// Parquet compression algorithm - pub parquet_compression: Compression, -} - -impl FromArgMatches for Server { - fn from_arg_matches(m: &clap::ArgMatches) -> Result { - let mut s: Self = Self::default(); - s.update_from_arg_matches(m)?; - Ok(s) - } - - fn update_from_arg_matches(&mut self, m: &clap::ArgMatches) -> Result<(), clap::Error> { - self.local_cache_path = m.get_one::(Self::CACHE).cloned(); - self.tls_cert_path = m.get_one::(Self::TLS_CERT).cloned(); - self.tls_key_path = m.get_one::(Self::TLS_KEY).cloned(); - self.domain_address = m.get_one::(Self::DOMAIN_URI).cloned(); - let openid_client_id = m.get_one::(Self::OPENID_CLIENT_ID).cloned(); - let openid_client_secret = m.get_one::(Self::OPENID_CLIENT_SECRET).cloned(); - let openid_issuer = m.get_one::(Self::OPENID_ISSUER).cloned(); - - self.address = m - .get_one::(Self::ADDRESS) - .cloned() - .expect("default value for address"); - self.local_staging_path = m - .get_one::(Self::STAGING) - .cloned() - .expect("default value for staging"); - self.local_cache_size = m - .get_one::(Self::CACHE_SIZE) - .cloned() - .expect("default value for cache size"); - self.upload_interval = m - .get_one::(Self::UPLOAD_INTERVAL) - .cloned() - .expect("default value for upload"); - self.username = m - .get_one::(Self::USERNAME) - .cloned() - .expect("default for username"); - self.password = m - .get_one::(Self::PASSWORD) - .cloned() - .expect("default for password"); - self.check_update = m - .get_one::(Self::CHECK_UPDATE) - .cloned() - .expect("default for check update"); - self.send_analytics = m - .get_one::(Self::SEND_ANALYTICS) - .cloned() - .expect("default for send analytics"); - self.open_ai_key = m.get_one::(Self::OPEN_AI_KEY).cloned(); - self.grpc_port = m - .get_one::(Self::GRPC_PORT) - .cloned() - .expect("default for livetail port"); - self.livetail_channel_capacity = m - .get_one::(Self::LIVETAIL_CAPACITY) - .cloned() - .expect("default for livetail capacity"); - // converts Gib to bytes before assigning - self.query_memory_pool_size = m - .get_one::(Self::QUERY_MEM_POOL_SIZE) - .cloned() - .map(|gib| gib as usize * 1024usize.pow(3)); - self.row_group_size = m - .get_one::(Self::ROW_GROUP_SIZE) - .cloned() - .expect("default for row_group size"); - self.parquet_compression = match m - .get_one::(Self::PARQUET_COMPRESSION_ALGO) - .expect("default for compression algo") - .as_str() - { - "uncompressed" => Compression::UNCOMPRESSED, - "snappy" => Compression::SNAPPY, - "gzip" => Compression::GZIP, - "lzo" => Compression::LZO, - "brotli" => Compression::BROTLI, - "lz4" => Compression::LZ4, - "zstd" => Compression::ZSTD, - _ => unreachable!(), - }; - - self.openid = match (openid_client_id, openid_client_secret, openid_issuer) { - (Some(id), Some(secret), Some(issuer)) => { - let origin = if let Some(url) = self.domain_address.clone() { 
- oidc::Origin::Production(url) - } else { - oidc::Origin::Local { - socket_addr: self.address.clone(), - https: self.tls_cert_path.is_some() && self.tls_key_path.is_some(), - } - }; - Some(OpenidConfig { - id, - secret, - issuer, - origin, - }) - } - _ => None, - }; - - Ok(()) - } +#[derive(Debug, Default, Eq, PartialEq)] +pub enum Mode { + Query, + Ingest, + #[default] + All, } -impl Server { - // identifiers for arguments - pub const TLS_CERT: &'static str = "tls-cert-path"; - pub const TLS_KEY: &'static str = "tls-key-path"; - pub const ADDRESS: &'static str = "address"; - pub const DOMAIN_URI: &'static str = "origin"; - pub const STAGING: &'static str = "local-staging-path"; - pub const CACHE: &'static str = "cache-path"; - pub const CACHE_SIZE: &'static str = "cache-size"; - pub const UPLOAD_INTERVAL: &'static str = "upload-interval"; - pub const USERNAME: &'static str = "username"; - pub const PASSWORD: &'static str = "password"; - pub const CHECK_UPDATE: &'static str = "check-update"; - pub const SEND_ANALYTICS: &'static str = "send-analytics"; - pub const OPEN_AI_KEY: &'static str = "open-ai-key"; - pub const OPENID_CLIENT_ID: &'static str = "oidc-client"; - pub const OPENID_CLIENT_SECRET: &'static str = "oidc-client-secret"; - pub const OPENID_ISSUER: &'static str = "oidc-issuer"; - pub const GRPC_PORT: &'static str = "grpc-port"; - pub const LIVETAIL_CAPACITY: &'static str = "livetail-capacity"; - // todo : what should this flag be - pub const QUERY_MEM_POOL_SIZE: &'static str = "query-mempool-size"; - pub const ROW_GROUP_SIZE: &'static str = "row-group-size"; - pub const PARQUET_COMPRESSION_ALGO: &'static str = "compression-algo"; - pub const DEFAULT_USERNAME: &'static str = "admin"; - pub const DEFAULT_PASSWORD: &'static str = "admin"; - - pub fn local_stream_data_path(&self, stream_name: &str) -> PathBuf { - self.local_staging_path.join(stream_name) +impl Mode { + pub fn to_str(&self) -> &str { + match self { + Mode::Query => "Query", + Mode::Ingest => "Ingest", + Mode::All => "All", + } } - pub fn get_scheme(&self) -> String { - if self.tls_cert_path.is_some() && self.tls_key_path.is_some() { - return "https".to_string(); + pub fn from_string(mode: &str) -> Result { + match mode { + "Query" => Ok(Mode::Query), + "Ingest" => Ok(Mode::Ingest), + "All" => Ok(Mode::All), + x => Err(format!("Trying to Parse Invalid mode: {}", x)), } - "http".to_string() } +} - pub fn get_clap_command(name: &'static str) -> Command { - Command::new(name).next_line_help(false) - .arg( - Arg::new(Self::TLS_CERT) - .long(Self::TLS_CERT) - .env("P_TLS_CERT_PATH") - .value_name("PATH") - .value_parser(validation::file_path) - .help("Local path on this device where certificate file is located. Required to enable TLS"), - ) - .arg( - Arg::new(Self::TLS_KEY) - .long(Self::TLS_KEY) - .env("P_TLS_KEY_PATH") - .value_name("PATH") - .value_parser(validation::file_path) - .help("Local path on this device where private key file is located. 
Required to enable TLS"), - ) - .arg( - Arg::new(Self::ADDRESS) - .long(Self::ADDRESS) - .env("P_ADDR") - .value_name("ADDR:PORT") - .default_value("0.0.0.0:8000") - .value_parser(validation::socket_addr) - .help("Address and port for Parseable HTTP(s) server"), - ) - .arg( - Arg::new(Self::STAGING) - .long(Self::STAGING) - .env("P_STAGING_DIR") - .value_name("DIR") - .default_value("./staging") - .value_parser(validation::canonicalize_path) - .help("Local path on this device to be used as landing point for incoming events") - .next_line_help(true), - ) - .arg( - Arg::new(Self::CACHE) - .long(Self::CACHE) - .env("P_CACHE_DIR") - .value_name("DIR") - .value_parser(validation::canonicalize_path) - .help("Local path on this device to be used for caching data") - .next_line_help(true), - ) - .arg( - Arg::new(Self::CACHE_SIZE) - .long(Self::CACHE_SIZE) - .env("P_CACHE_SIZE") - .value_name("size") - .default_value("1GiB") - .value_parser(validation::cache_size) - .help("Maximum allowed cache size for all streams combined (In human readable format, e.g 1GiB, 2GiB, 100MB)") - .next_line_help(true), - ) - .arg( - Arg::new(Self::UPLOAD_INTERVAL) - .long(Self::UPLOAD_INTERVAL) - .env("P_STORAGE_UPLOAD_INTERVAL") - .value_name("SECONDS") - .default_value("60") - .value_parser(validation::upload_interval) - .help("Interval in seconds after which staging data would be sent to the storage") - .next_line_help(true), - ) - .arg( - Arg::new(Self::USERNAME) - .long(Self::USERNAME) - .env("P_USERNAME") - .value_name("STRING") - .required(true) - .help("Admin username to be set for this Parseable server"), - ) - .arg( - Arg::new(Self::PASSWORD) - .long(Self::PASSWORD) - .env("P_PASSWORD") - .value_name("STRING") - .required(true) - .help("Admin password to be set for this Parseable server"), - ) - .arg( - Arg::new(Self::CHECK_UPDATE) - .long(Self::CHECK_UPDATE) - .env("P_CHECK_UPDATE") - .value_name("BOOL") - .required(false) - .default_value("true") - .value_parser(value_parser!(bool)) - .help("Enable/Disable checking for new Parseable release"), - ) - .arg( - Arg::new(Self::SEND_ANALYTICS) - .long(Self::SEND_ANALYTICS) - .env("P_SEND_ANONYMOUS_USAGE_DATA") - .value_name("BOOL") - .required(false) - .default_value("true") - .value_parser(value_parser!(bool)) - .help("Enable/Disable anonymous telemetry data collection"), - ) - .arg( - Arg::new(Self::OPEN_AI_KEY) - .long(Self::OPEN_AI_KEY) - .env("P_OPENAI_API_KEY") - .value_name("STRING") - .required(false) - .help("OpenAI key to enable llm features"), - ) - .arg( - Arg::new(Self::OPENID_CLIENT_ID) - .long(Self::OPENID_CLIENT_ID) - .env("P_OIDC_CLIENT_ID") - .value_name("STRING") - .required(false) - .help("Client id for OIDC provider"), - ) - .arg( - Arg::new(Self::OPENID_CLIENT_SECRET) - .long(Self::OPENID_CLIENT_SECRET) - .env("P_OIDC_CLIENT_SECRET") - .value_name("STRING") - .required(false) - .help("Client secret for OIDC provider"), - ) - .arg( - Arg::new(Self::OPENID_ISSUER) - .long(Self::OPENID_ISSUER) - .env("P_OIDC_ISSUER") - .value_name("URl") - .required(false) - .value_parser(validation::url) - .help("OIDC provider's host address"), - ) - .arg( - Arg::new(Self::DOMAIN_URI) - .long(Self::DOMAIN_URI) - .env("P_ORIGIN_URI") - .value_name("URL") - .required(false) - .value_parser(validation::url) - .help("Parseable server global domain address"), - ) - .arg( - Arg::new(Self::GRPC_PORT) - .long(Self::GRPC_PORT) - .env("P_GRPC_PORT") - .value_name("PORT") - .default_value("8001") - .required(false) - .value_parser(value_parser!(u16)) - .help("Port for 
gRPC server"), - ) - .arg( - Arg::new(Self::LIVETAIL_CAPACITY) - .long(Self::LIVETAIL_CAPACITY) - .env("P_LIVETAIL_CAPACITY") - .value_name("NUMBER") - .default_value("1000") - .required(false) - .value_parser(value_parser!(usize)) - .help("Number of rows in livetail channel"), - ) - .arg( - Arg::new(Self::QUERY_MEM_POOL_SIZE) - .long(Self::QUERY_MEM_POOL_SIZE) - .env("P_QUERY_MEMORY_LIMIT") - .value_name("Gib") - .required(false) - .value_parser(value_parser!(u8)) - .help("Set a fixed memory limit for query"), - ) - .arg( - Arg::new(Self::ROW_GROUP_SIZE) - .long(Self::ROW_GROUP_SIZE) - .env("P_PARQUET_ROW_GROUP_SIZE") - .value_name("NUMBER") - .required(false) - .default_value("16384") - .value_parser(value_parser!(usize)) - .help("Number of rows in a row group"), - ) - .arg( - Arg::new(Self::PARQUET_COMPRESSION_ALGO) - .long(Self::PARQUET_COMPRESSION_ALGO) - .env("P_PARQUET_COMPRESSION_ALGO") - .value_name("[UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, LZ4, ZSTD]") - .required(false) - .default_value("lz4") - .value_parser([ - "uncompressed", - "snappy", - "gzip", - "lzo", - "brotli", - "lz4", - "zstd"]) - .help("Parquet compression algorithm"), - ).group( - ArgGroup::new("oidc") - .args([Self::OPENID_CLIENT_ID, Self::OPENID_CLIENT_SECRET, Self::OPENID_ISSUER]) - .requires_all([Self::OPENID_CLIENT_ID, Self::OPENID_CLIENT_SECRET, Self::OPENID_ISSUER]) - .multiple(true) - ) +impl ToString for Mode { + fn to_string(&self) -> String { + self.to_str().to_string() } } @@ -620,7 +278,6 @@ pub mod validation { use path_clean::PathClean; use crate::option::MIN_CACHE_SIZE_BYTES; - use crate::storage::LOCAL_SYNC_INTERVAL; use human_size::{multiples, SpecificSize}; pub fn file_path(s: &str) -> Result { @@ -698,14 +355,4 @@ pub mod validation { } Ok(size) } - - pub fn upload_interval(s: &str) -> Result { - let u = s - .parse::() - .map_err(|_| "invalid upload interval".to_string())?; - if u < LOCAL_SYNC_INTERVAL { - return Err("object storage upload interval must be 60 seconds or more".to_string()); - } - Ok(u) - } } diff --git a/server/src/query.rs b/server/src/query.rs index e3f9d8dbc..c3abe3d61 100644 --- a/server/src/query.rs +++ b/server/src/query.rs @@ -33,24 +33,24 @@ use datafusion::logical_expr::{Explain, Filter, LogicalPlan, PlanType, ToStringi use datafusion::prelude::*; use itertools::Itertools; use once_cell::sync::Lazy; +use serde_json::{json, Value}; use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::sync::Arc; use sysinfo::{System, SystemExt}; -use crate::event; -use crate::option::CONFIG; -use crate::storage::{ObjectStorageProvider, StorageDir}; - use self::error::ExecuteError; - use self::stream_schema_provider::GlobalSchemaProvider; pub use self::stream_schema_provider::PartialTimeFilter; +use crate::event; +use crate::option::CONFIG; +use crate::storage::{ObjectStorageProvider, StorageDir}; pub static QUERY_SESSION: Lazy = Lazy::new(|| Query::create_session_context(CONFIG.storage())); // A query request by client +#[derive(Debug)] pub struct Query { pub raw_logical_plan: LogicalPlan, pub start: DateTime, @@ -102,9 +102,16 @@ impl Query { SessionContext::new_with_state(state) } - pub async fn execute(&self) -> Result<(Vec, Vec), ExecuteError> { + pub async fn execute( + &self, + stream_name: String, + ) -> Result<(Vec, Vec), ExecuteError> { + let store = CONFIG.storage().get_object_store(); + let object_store_format = store.get_object_store_format(&stream_name).await?; + let time_partition = object_store_format.time_partition; + let df = QUERY_SESSION - 
.execute_logical_plan(self.final_logical_plan()) + .execute_logical_plan(self.final_logical_plan(&time_partition)) .await?; let fields = df @@ -115,17 +122,22 @@ impl Query { .cloned() .collect_vec(); + if fields.is_empty() { + return Ok((vec![], fields)); + } + let results = df.collect().await?; Ok((results, fields)) } /// return logical plan with all time filters applied through - fn final_logical_plan(&self) -> LogicalPlan { + fn final_logical_plan(&self, time_partition: &Option) -> LogicalPlan { let filters = self.filter_tag.clone().and_then(tag_filter); // see https://github.com/apache/arrow-datafusion/pull/8400 // this can be eliminated in later version of datafusion but with slight caveat // transform cannot modify stringified plans by itself // we by knowing this plan is not in the optimization procees chose to overwrite the stringified plan + match self.raw_logical_plan.clone() { LogicalPlan::Explain(plan) => { let transformed = transform( @@ -133,6 +145,7 @@ impl Query { self.start.naive_utc(), self.end.naive_utc(), filters, + time_partition, ); LogicalPlan::Explain(Explain { verbose: plan.verbose, @@ -144,7 +157,13 @@ impl Query { logical_optimization_succeeded: plan.logical_optimization_succeeded, }) } - x => transform(x, self.start.naive_utc(), self.end.naive_utc(), filters), + x => transform( + x, + self.start.naive_utc(), + self.end.naive_utc(), + filters, + time_partition, + ), } } @@ -156,12 +175,12 @@ impl Query { } #[derive(Debug, Default)] -struct TableScanVisitor { +pub(crate) struct TableScanVisitor { tables: Vec, } impl TableScanVisitor { - fn into_inner(self) -> Vec { + pub fn into_inner(self) -> Vec { self.tables } } @@ -195,33 +214,53 @@ fn transform( start_time: NaiveDateTime, end_time: NaiveDateTime, filters: Option, + time_partition: &Option, ) -> LogicalPlan { plan.transform(&|plan| match plan { LogicalPlan::TableScan(table) => { let mut new_filters = vec![]; - if !table_contains_any_time_filters(&table) { - let start_time_filter = PartialTimeFilter::Low(std::ops::Bound::Included( - start_time, - )) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned_reference()), - event::DEFAULT_TIMESTAMP_KEY, - ))); - let end_time_filter = PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) - .binary_expr(Expr::Column(Column::new( - Some(table.table_name.to_owned_reference()), - event::DEFAULT_TIMESTAMP_KEY, - ))); - new_filters.push(start_time_filter); - new_filters.push(end_time_filter); + if !table_contains_any_time_filters(&table, time_partition) { + let mut _start_time_filter: Expr; + let mut _end_time_filter: Expr; + match time_partition { + Some(time_partition) => { + _start_time_filter = + PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) + .binary_expr_timestamp_partition_key(Expr::Column(Column::new( + Some(table.table_name.to_owned_reference()), + time_partition.clone(), + ))); + _end_time_filter = + PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) + .binary_expr_timestamp_partition_key(Expr::Column(Column::new( + Some(table.table_name.to_owned_reference()), + time_partition, + ))); + } + None => { + _start_time_filter = + PartialTimeFilter::Low(std::ops::Bound::Included(start_time)) + .binary_expr_default_timestamp_key(Expr::Column(Column::new( + Some(table.table_name.to_owned_reference()), + event::DEFAULT_TIMESTAMP_KEY, + ))); + _end_time_filter = + PartialTimeFilter::High(std::ops::Bound::Excluded(end_time)) + .binary_expr_default_timestamp_key(Expr::Column(Column::new( + 
Some(table.table_name.to_owned_reference()), + event::DEFAULT_TIMESTAMP_KEY, + ))); + } + } + + new_filters.push(_start_time_filter); + new_filters.push(_end_time_filter); } if let Some(tag_filters) = filters.clone() { new_filters.push(tag_filters) } - let new_filter = new_filters.into_iter().reduce(and); - if let Some(new_filter) = new_filter { let filter = Filter::try_new(new_filter, Arc::new(LogicalPlan::TableScan(table))).unwrap(); @@ -235,7 +274,10 @@ fn transform( .expect("transform only transforms the tablescan") } -fn table_contains_any_time_filters(table: &datafusion::logical_expr::TableScan) -> bool { +fn table_contains_any_time_filters( + table: &datafusion::logical_expr::TableScan, + time_partition: &Option, +) -> bool { table .filters .iter() @@ -246,7 +288,11 @@ fn table_contains_any_time_filters(table: &datafusion::logical_expr::TableScan) None } }) - .any(|expr| matches!(&*expr.left, Expr::Column(Column { name, .. }) if (name == event::DEFAULT_TIMESTAMP_KEY))) + .any(|expr| { + matches!(&*expr.left, Expr::Column(Column { name, .. }) + if ((time_partition.is_some() && name == time_partition.as_ref().unwrap()) || + (!time_partition.is_some() && name == event::DEFAULT_TIMESTAMP_KEY))) + }) } #[allow(dead_code)] @@ -290,6 +336,52 @@ fn time_from_path(path: &Path) -> DateTime { .unwrap() } +/// unused for now might need it later +#[allow(unused)] +pub fn flatten_objects_for_count(objects: Vec) -> Vec { + if objects.is_empty() { + return objects; + } + + // check if all the keys start with "COUNT" + let flag = objects.iter().all(|obj| { + obj.as_object() + .unwrap() + .keys() + .all(|key| key.starts_with("COUNT")) + }) && objects.iter().all(|obj| { + obj.as_object() + .unwrap() + .keys() + .all(|key| key == objects[0].as_object().unwrap().keys().next().unwrap()) + }); + + if flag { + let mut accum = 0u64; + let key = objects[0] + .as_object() + .unwrap() + .keys() + .next() + .unwrap() + .clone(); + + for obj in objects { + let count = obj.as_object().unwrap().keys().fold(0, |acc, key| { + let value = obj.as_object().unwrap().get(key).unwrap().as_u64().unwrap(); + acc + value + }); + accum += count; + } + + vec![json!({ + key: accum + })] + } else { + objects + } +} + pub mod error { use crate::storage::ObjectStorageError; use datafusion::error::DataFusionError; @@ -305,6 +397,10 @@ pub mod error { #[cfg(test)] mod tests { + use serde_json::json; + + use crate::query::flatten_objects_for_count; + use super::time_from_path; use std::path::PathBuf; @@ -314,4 +410,82 @@ mod tests { let time = time_from_path(path.as_path()); assert_eq!(time.timestamp(), 1640995200); } + + #[test] + fn test_flat_simple() { + let val = vec![ + json!({ + "COUNT(*)": 1 + }), + json!({ + "COUNT(*)": 2 + }), + json!({ + "COUNT(*)": 3 + }), + ]; + + let out = flatten_objects_for_count(val); + assert_eq!(out, vec![json!({"COUNT(*)": 6})]); + } + + #[test] + fn test_flat_empty() { + let val = vec![]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + + #[test] + fn test_flat_same_multi() { + let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(ALPHA)": 2})]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(vec![json!({"COUNT(ALPHA)": 3})], out); + } + + #[test] + fn test_flat_diff_multi() { + let val = vec![json!({"COUNT(ALPHA)": 1}), json!({"COUNT(BETA)": 2})]; + let out = flatten_objects_for_count(val.clone()); + assert_eq!(out, val); + } + + #[test] + fn test_flat_fail() { + let val = vec![ + json!({ + "Num": 1 + }), + json!({ + "Num": 2 + }), + json!({ + 
"Num": 3 + }), + ]; + + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } + + #[test] + fn test_flat_multi_key() { + let val = vec![ + json!({ + "Num": 1, + "COUNT(*)": 1 + }), + json!({ + "Num": 2, + "COUNT(*)": 2 + }), + json!({ + "Num": 3, + "COUNT(*)": 3 + }), + ]; + + let out = flatten_objects_for_count(val.clone()); + assert_eq!(val, out); + } } diff --git a/server/src/query/filter_optimizer.rs b/server/src/query/filter_optimizer.rs index 7a826537a..d78d0fce9 100644 --- a/server/src/query/filter_optimizer.rs +++ b/server/src/query/filter_optimizer.rs @@ -26,7 +26,7 @@ use datafusion::{ scalar::ScalarValue, }; -/// Rewrites logical plan for source using projection and filter +/// Rewrites logical plan for source using projection and filter pub struct FilterOptimizerRule { pub column: String, pub literals: Vec, @@ -117,9 +117,7 @@ impl FilterOptimizerRule { Expr::Column(Column::from_name(&self.column)).like(lit(format!("%{}%", literal))) }); - let Some(mut filter_expr) = patterns.next() else { - return None; - }; + let mut filter_expr = patterns.next()?; for expr in patterns { filter_expr = or(filter_expr, expr) } diff --git a/server/src/query/listing_table_builder.rs b/server/src/query/listing_table_builder.rs index 59bf05a3a..669d53c61 100644 --- a/server/src/query/listing_table_builder.rs +++ b/server/src/query/listing_table_builder.rs @@ -25,7 +25,7 @@ use datafusion::{ listing::{ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl}, }, error::DataFusionError, - logical_expr::col, + logical_expr::{col, Expr}, }; use futures_util::{future, stream::FuturesUnordered, Future, TryStreamExt}; use itertools::Itertools; @@ -183,13 +183,19 @@ impl ListingTableBuilder { self, schema: Arc, map: impl Fn(Vec) -> Vec, + time_partition: Option, ) -> Result>, DataFusionError> { if self.listing.is_empty() { return Ok(None); } - + let file_sort_order: Vec>; let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); - let file_sort_order = vec![vec![col(DEFAULT_TIMESTAMP_KEY).sort(true, false)]]; + if let Some(time_partition) = time_partition { + file_sort_order = vec![vec![col(time_partition).sort(true, false)]]; + } else { + file_sort_order = vec![vec![col(DEFAULT_TIMESTAMP_KEY).sort(true, false)]]; + } + let listing_options = ListingOptions::new(Arc::new(file_format)) .with_file_extension(".parquet") .with_file_sort_order(file_sort_order) diff --git a/server/src/query/stream_schema_provider.rs b/server/src/query/stream_schema_provider.rs index 874afca78..2577593c4 100644 --- a/server/src/query/stream_schema_provider.rs +++ b/server/src/query/stream_schema_provider.rs @@ -16,8 +16,11 @@ * */ -use std::{any::Any, collections::HashMap, ops::Bound, sync::Arc}; - +use crate::Mode; +use crate::{ + catalog::snapshot::{self, Snapshot}, + storage::{ObjectStoreFormat, STREAM_ROOT_DIRECTORY}, +}; use arrow_array::RecordBatch; use arrow_schema::{Schema, SchemaRef, SortOptions}; use bytes::Bytes; @@ -40,18 +43,19 @@ use datafusion::{ optimizer::utils::conjunction, physical_expr::{create_physical_expr, PhysicalSortExpr}, physical_plan::{self, empty::EmptyExec, union::UnionExec, ExecutionPlan, Statistics}, - prelude::{Column, Expr}, + prelude::Expr, scalar::ScalarValue, }; use futures_util::{stream::FuturesOrdered, StreamExt, TryFutureExt, TryStreamExt}; use itertools::Itertools; use object_store::{path::Path, ObjectStore}; +use relative_path::RelativePathBuf; +use std::{any::Any, collections::HashMap, ops::Bound, sync::Arc}; use url::Url; use crate::{ 
catalog::{ self, column::TypedStatistics, manifest::Manifest, snapshot::ManifestItem, ManifestFile, - Snapshot, }, event::{self, DEFAULT_TIMESTAMP_KEY}, localcache::LocalCacheManager, @@ -62,6 +66,7 @@ use crate::{ }; use super::listing_table_builder::ListingTableBuilder; +use crate::catalog::Snapshot as CatalogSnapshot; // schema provider for stream based on global data pub struct GlobalSchemaProvider { @@ -114,6 +119,7 @@ async fn create_parquet_physical_plan( filters: &[Expr], limit: Option, state: &SessionState, + time_partition: Option, ) -> Result, DataFusionError> { let filters = if let Some(expr) = conjunction(filters.to_vec()) { let table_df_schema = schema.as_ref().clone().to_dfschema()?; @@ -125,14 +131,18 @@ async fn create_parquet_physical_plan( }; let sort_expr = PhysicalSortExpr { - expr: physical_plan::expressions::col(DEFAULT_TIMESTAMP_KEY, &schema)?, + expr: if let Some(time_partition) = time_partition { + physical_plan::expressions::col(&time_partition, &schema)? + } else { + physical_plan::expressions::col(DEFAULT_TIMESTAMP_KEY, &schema)? + }, options: SortOptions { descending: true, nulls_first: true, }, }; - let file_format = ParquetFormat::default().with_enable_pruning(Some(true)); + // create the execution plan let plan = file_format .create_physical_plan( @@ -151,7 +161,6 @@ async fn create_parquet_physical_plan( filters.as_ref(), ) .await?; - Ok(plan) } @@ -209,7 +218,6 @@ fn partitioned_files( let mut partitioned_files = Vec::from_iter((0..target_partition).map(|_| Vec::new())); let mut column_statistics = HashMap::>::new(); let mut count = 0; - for (index, file) in manifest_files .into_iter() .enumerate() @@ -221,7 +229,6 @@ fn partitioned_files( columns, .. } = file; - partitioned_files[index].push(PartitionedFile::new(file_path, file.file_size)); columns.into_iter().for_each(|col| { column_statistics @@ -235,7 +242,6 @@ fn partitioned_files( }); count += num_rows; } - let statistics = table_schema .fields() .iter() @@ -287,13 +293,24 @@ impl TableProvider for StandardTableProvider { ) -> Result, DataFusionError> { let mut memory_exec = None; let mut cache_exec = None; + let object_store = state + .runtime_env() + .object_store_registry + .get_store(&self.url) + .unwrap(); + let glob_storage = CONFIG.storage().get_object_store(); - let time_filters = extract_primary_filter(filters); + let object_store_format = glob_storage + .get_object_store_format(&self.stream) + .await + .map_err(|err| DataFusionError::Plan(err.to_string()))?; + let time_partition = object_store_format.time_partition; + let time_filters = extract_primary_filter(filters, time_partition.clone()); if time_filters.is_empty() { return Err(DataFusionError::Plan("potentially unbounded query on time range. 
Table scanning requires atleast one time bound".to_string())); } - if include_now(filters) { + if include_now(filters, time_partition.clone()) { if let Some(records) = event::STREAM_WRITERS.recordbatches_cloned(&self.stream, &self.schema) { @@ -305,22 +322,34 @@ impl TableProvider for StandardTableProvider { ); } }; - - let object_store = state - .runtime_env() - .object_store_registry - .get_store(&self.url) - .unwrap(); - let glob_storage = CONFIG.storage().get_object_store(); - - // Fetch snapshot - let snapshot = glob_storage - .get_snapshot(&self.stream) - .await - .map_err(|err| DataFusionError::Plan(err.to_string()))?; + let mut merged_snapshot: snapshot::Snapshot = Snapshot::default(); + if CONFIG.parseable.mode == Mode::Query { + let path = RelativePathBuf::from_iter([&self.stream, STREAM_ROOT_DIRECTORY]); + let obs = glob_storage + .get_objects( + Some(&path), + Box::new(|file_name| file_name.starts_with(".ingestor")), + ) + .await; + + if let Ok(obs) = obs { + for ob in obs { + if let Ok(object_store_format) = + serde_json::from_slice::(&ob) + { + let snapshot = object_store_format.snapshot; + for manifest in snapshot.manifest_list { + merged_snapshot.manifest_list.push(manifest); + } + } + } + } + } else { + merged_snapshot = object_store_format.snapshot; + } // Is query timerange is overlapping with older data. - if is_overlapping_query(&snapshot.manifest_list, &time_filters) { + if is_overlapping_query(&merged_snapshot.manifest_list, &time_filters) { return legacy_listing_table( self.stream.clone(), memory_exec, @@ -332,12 +361,19 @@ impl TableProvider for StandardTableProvider { projection, filters, limit, + time_partition.clone(), ) .await; } - let mut manifest_files = - collect_from_snapshot(&snapshot, &time_filters, object_store, filters, limit).await?; + let mut manifest_files = collect_from_snapshot( + &merged_snapshot, + &time_filters, + object_store, + filters, + limit, + ) + .await?; if manifest_files.is_empty() { return final_plan(vec![memory_exec], projection, self.schema.clone()); @@ -374,6 +410,7 @@ impl TableProvider for StandardTableProvider { filters, limit, state, + time_partition.clone(), ) .await?; @@ -399,6 +436,7 @@ impl TableProvider for StandardTableProvider { filters, limit, state, + time_partition.clone(), ) .await?; @@ -436,11 +474,16 @@ async fn legacy_listing_table( projection: Option<&Vec>, filters: &[Expr], limit: Option, + time_partition: Option, ) -> Result, DataFusionError> { let remote_table = ListingTableBuilder::new(stream) .populate_via_listing(glob_storage.clone(), object_store, time_filters) .and_then(|builder| async { - let table = builder.build(schema.clone(), |x| glob_storage.query_prefixes(x))?; + let table = builder.build( + schema.clone(), + |x| glob_storage.query_prefixes(x), + time_partition, + )?; let res = match table { Some(table) => Some(table.scan(state, projection, filters, limit).await?), _ => None, @@ -458,6 +501,7 @@ fn final_plan( schema: Arc, ) -> Result, DataFusionError> { let mut execution_plans = execution_plans.into_iter().flatten().collect_vec(); + let exec: Arc = if execution_plans.is_empty() { let schema = match projection { Some(projection) => Arc::new(schema.project(projection)?), @@ -469,7 +513,6 @@ fn final_plan( } else { Arc::new(UnionExec::new(execution_plans)) }; - Ok(exec) } @@ -492,11 +535,11 @@ pub enum PartialTimeFilter { } impl PartialTimeFilter { - fn try_from_expr(expr: &Expr) -> Option { + fn try_from_expr(expr: &Expr, time_partition: Option) -> Option { let Expr::BinaryExpr(binexpr) = expr else { 
return None; }; - let (op, time) = extract_timestamp_bound(binexpr)?; + let (op, time) = extract_timestamp_bound(binexpr.clone(), time_partition)?; let value = match op { Operator::Gt => PartialTimeFilter::Low(Bound::Excluded(time)), Operator::GtEq => PartialTimeFilter::Low(Bound::Included(time)), @@ -509,7 +552,7 @@ impl PartialTimeFilter { Some(value) } - pub fn binary_expr(&self, left: Expr) -> Expr { + pub fn binary_expr_default_timestamp_key(&self, left: Expr) -> Expr { let (op, right) = match self { PartialTimeFilter::Low(Bound::Excluded(time)) => { (Operator::Gt, time.timestamp_millis()) @@ -537,15 +580,24 @@ impl PartialTimeFilter { )) } - fn is_greater_than(&self, other: &NaiveDateTime) -> bool { - match self { - PartialTimeFilter::Low(Bound::Excluded(time)) => time >= other, - PartialTimeFilter::Low(Bound::Included(time)) - | PartialTimeFilter::High(Bound::Excluded(time)) - | PartialTimeFilter::High(Bound::Included(time)) => time > other, - PartialTimeFilter::Eq(time) => time > other, + pub fn binary_expr_timestamp_partition_key(&self, left: Expr) -> Expr { + let (op, right) = match self { + PartialTimeFilter::Low(Bound::Excluded(time)) => (Operator::Gt, time), + PartialTimeFilter::Low(Bound::Included(time)) => (Operator::GtEq, time), + PartialTimeFilter::High(Bound::Excluded(time)) => (Operator::Lt, time), + PartialTimeFilter::High(Bound::Included(time)) => (Operator::LtEq, time), + PartialTimeFilter::Eq(time) => (Operator::Eq, time), _ => unimplemented!(), - } + }; + + Expr::BinaryExpr(BinaryExpr::new( + Box::new(left), + op, + Box::new(Expr::Literal(ScalarValue::Utf8(Some(format!( + "{:?}", + right + ))))), + )) } } @@ -554,26 +606,36 @@ fn is_overlapping_query( time_filters: &[PartialTimeFilter], ) -> bool { // This is for backwards compatiblity. Older table format relies on listing. 
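// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the reworked overlap check from
// is_overlapping_query above, restated without the DataFusion/catalog types.
// A query has to fall back to the legacy listing table when any lower time
// bound in its filters starts before the earliest lower bound recorded in the
// manifest list, i.e. when it may touch data older than the manifest catalog.
// `manifest_lower_bounds` stands in for the snapshot's manifest entries.
use std::ops::Bound;

use chrono::NaiveDateTime;

#[allow(dead_code)]
enum PartialTimeFilter {
    Low(Bound<NaiveDateTime>),
    High(Bound<NaiveDateTime>),
}

fn is_overlapping_query(
    manifest_lower_bounds: &[NaiveDateTime],
    time_filters: &[PartialTimeFilter],
) -> bool {
    // no manifest entries at all: everything must go through listing
    let Some(first_lower_bound) = manifest_lower_bounds.iter().min() else {
        return true;
    };

    time_filters.iter().any(|filter| match filter {
        PartialTimeFilter::Low(Bound::Included(t))
        | PartialTimeFilter::Low(Bound::Excluded(t)) => t < first_lower_bound,
        _ => false,
    })
}
// ---------------------------------------------------------------------------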
- // if the time is lower than upper bound of first file then we consider it overlapping - let Some(first_entry_upper_bound) = - manifest_list.iter().map(|file| file.time_upper_bound).min() + // if the start time is lower than lower bound of first file then we consider it overlapping + let Some(first_entry_lower_bound) = + manifest_list.iter().map(|file| file.time_lower_bound).min() else { return true; }; - !time_filters - .iter() - .all(|filter| filter.is_greater_than(&first_entry_upper_bound.naive_utc())) + for filter in time_filters { + match filter { + PartialTimeFilter::Low(Bound::Excluded(time)) + | PartialTimeFilter::Low(Bound::Included(time)) => { + if time < &first_entry_lower_bound.naive_utc() { + return true; + } + } + _ => {} + } + } + + false } -fn include_now(filters: &[Expr]) -> bool { +fn include_now(filters: &[Expr], time_partition: Option) -> bool { let current_minute = Utc::now() .with_second(0) .and_then(|x| x.with_nanosecond(0)) .expect("zeroed value is valid") .naive_utc(); - let time_filters = extract_primary_filter(filters); + let time_filters = extract_primary_filter(filters, time_partition); let upper_bound_matches = time_filters.iter().any(|filter| match filter { PartialTimeFilter::High(Bound::Excluded(time)) @@ -598,7 +660,7 @@ fn expr_in_boundary(filter: &Expr) -> bool { let Expr::BinaryExpr(binexpr) = filter else { return false; }; - let Some((op, time)) = extract_timestamp_bound(binexpr) else { + let Some((op, time)) = extract_timestamp_bound(binexpr.clone(), None) else { return false; }; @@ -612,11 +674,22 @@ fn expr_in_boundary(filter: &Expr) -> bool { ) } -fn extract_from_lit(expr: &Expr) -> Option { - if let Expr::Literal(value) = expr { +fn extract_from_lit(expr: BinaryExpr, time_partition: Option) -> Option { + let mut column_name: String = String::default(); + if let Expr::Column(column) = *expr.left { + column_name = column.name; + } + if let Expr::Literal(value) = *expr.right { match value { ScalarValue::TimestampMillisecond(Some(value), _) => { - Some(NaiveDateTime::from_timestamp_millis(*value).unwrap()) + Some(NaiveDateTime::from_timestamp_millis(value).unwrap()) + } + ScalarValue::Utf8(Some(str_value)) => { + if time_partition.is_some() && column_name == time_partition.unwrap() { + Some(str_value.parse::().unwrap()) + } else { + None + } } _ => None, } @@ -625,14 +698,11 @@ fn extract_from_lit(expr: &Expr) -> Option { } } -fn extract_timestamp_bound(binexpr: &BinaryExpr) -> Option<(Operator, NaiveDateTime)> { - if matches!(&*binexpr.left, Expr::Column(Column { name, .. 
}) if name == DEFAULT_TIMESTAMP_KEY) - { - let time = extract_from_lit(&binexpr.right)?; - Some((binexpr.op, time)) - } else { - None - } +fn extract_timestamp_bound( + binexpr: BinaryExpr, + time_partition: Option, +) -> Option<(Operator, NaiveDateTime)> { + Some((binexpr.op, extract_from_lit(binexpr, time_partition)?)) } async fn collect_manifest_files( @@ -658,11 +728,14 @@ async fn collect_manifest_files( } // extract start time and end time from filter preficate -fn extract_primary_filter(filters: &[Expr]) -> Vec { +fn extract_primary_filter( + filters: &[Expr], + time_partition: Option, +) -> Vec { let mut time_filters = Vec::new(); filters.iter().for_each(|expr| { let _ = expr.apply(&mut |expr| { - let time = PartialTimeFilter::try_from_expr(expr); + let time = PartialTimeFilter::try_from_expr(expr, time_partition.clone()); if let Some(time) = time { time_filters.push(time); Ok(VisitRecursion::Stop) @@ -830,7 +903,7 @@ mod tests { let res = is_overlapping_query( &manifest_items(), &[PartialTimeFilter::Low(std::ops::Bound::Included( - datetime_min(2023, 12, 15).naive_utc(), + datetime_min(2023, 12, 14).naive_utc(), ))], ); @@ -842,7 +915,7 @@ mod tests { let res = is_overlapping_query( &manifest_items(), &[PartialTimeFilter::Low(std::ops::Bound::Included( - datetime_min(2023, 12, 15) + datetime_min(2023, 12, 14) .naive_utc() .add(Duration::hours(3)), ))], diff --git a/server/src/rbac/map.rs b/server/src/rbac/map.rs index 6a82b9f9d..b5b92e5a7 100644 --- a/server/src/rbac/map.rs +++ b/server/src/rbac/map.rs @@ -159,9 +159,7 @@ impl Sessions { // remove a specific session pub fn remove_session(&mut self, key: &SessionKey) -> Option { - let Some((user, _)) = self.active_sessions.remove(key) else { - return None; - }; + let (user, _) = self.active_sessions.remove(key)?; if let Some(items) = self.user_sessions.get_mut(&user) { items.retain(|(session, _)| session != key); diff --git a/server/src/rbac/role.rs b/server/src/rbac/role.rs index 43b5160cf..ee17bea5c 100644 --- a/server/src/rbac/role.rs +++ b/server/src/rbac/role.rs @@ -24,6 +24,7 @@ pub enum Action { Query, CreateStream, ListStream, + GetStream, GetSchema, GetStats, DeleteStream, @@ -44,7 +45,11 @@ pub enum Action { ListRole, GetAbout, QueryLLM, + ListCluster, + ListClusterMetrics, + Deleteingestor, All, + GetAnalytics, } #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -97,7 +102,12 @@ impl RoleBuilder { | Action::ListRole | Action::CreateStream | Action::DeleteStream - | Action::ListStream => Permission::Unit(action), + | Action::GetStream + | Action::ListStream + | Action::ListCluster + | Action::ListClusterMetrics + | Action::Deleteingestor + | Action::GetAnalytics => Permission::Unit(action), Action::Ingest | Action::GetSchema | Action::GetStats @@ -128,7 +138,7 @@ pub mod model { Admin, Editor, Writer { stream: String }, - Ingester { stream: String }, + Ingestor { stream: String }, Reader { stream: String, tag: Option }, } @@ -147,7 +157,7 @@ pub mod model { } reader } - DefaultPrivilege::Ingester { stream } => { + DefaultPrivilege::Ingestor { stream } => { ingest_perm_builder().with_stream(stream.to_owned()) } } @@ -169,6 +179,7 @@ pub mod model { Action::Query, Action::CreateStream, Action::ListStream, + Action::GetStream, Action::GetSchema, Action::GetStats, Action::GetRetention, @@ -191,6 +202,7 @@ pub mod model { Action::Ingest, Action::Query, Action::ListStream, + Action::GetStream, Action::GetSchema, Action::GetStats, Action::GetRetention, @@ -209,12 +221,14 @@ pub mod model { actions: vec![ Action::Query, 
Action::ListStream, + Action::GetStream, Action::GetSchema, Action::GetStats, Action::GetRetention, Action::GetAlert, Action::GetAbout, Action::QueryLLM, + Action::ListCluster, ], stream: None, tag: None, diff --git a/server/src/rbac/user.rs b/server/src/rbac/user.rs index 6780c0fe9..c037a8381 100644 --- a/server/src/rbac/user.rs +++ b/server/src/rbac/user.rs @@ -60,7 +60,7 @@ impl User { pub fn new_oauth(username: String, roles: HashSet, user_info: UserInfo) -> Self { Self { ty: UserType::OAuth(OAuth { - userid: username, + userid: user_info.name.clone().unwrap_or(username), user_info, }), roles, diff --git a/server/src/response.rs b/server/src/response.rs index 18b86d78f..c6731e3a1 100644 --- a/server/src/response.rs +++ b/server/src/response.rs @@ -44,6 +44,7 @@ impl QueryResponse { } } let values = json_records.into_iter().map(Value::Object).collect_vec(); + let response = if self.with_fields { json!({ "fields": self.fields, diff --git a/server/src/static_schema.rs b/server/src/static_schema.rs new file mode 100644 index 000000000..cbf48b7ae --- /dev/null +++ b/server/src/static_schema.rs @@ -0,0 +1,143 @@ +use crate::event::{DEFAULT_METADATA_KEY, DEFAULT_TAGS_KEY, DEFAULT_TIMESTAMP_KEY}; +use crate::utils::arrow::get_field; +use anyhow::{anyhow, Error as AnyError}; +use serde::{Deserialize, Serialize}; +use std::str; + +use arrow_schema::{DataType, Field, Schema, TimeUnit}; +use std::{collections::HashMap, sync::Arc}; +#[derive(Serialize, Deserialize, Debug)] +pub struct StaticSchema { + fields: Vec, +} + +#[derive(Serialize, Deserialize, Debug)] +pub struct SchemaFields { + name: String, + data_type: String, +} +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ParsedSchema { + pub fields: Vec, + pub metadata: HashMap, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Fields { + name: String, + data_type: DataType, + nullable: bool, + dict_id: i64, + dict_is_ordered: bool, + metadata: HashMap, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] + +pub struct Metadata {} +pub fn convert_static_schema_to_arrow_schema( + static_schema: StaticSchema, +) -> Result, AnyError> { + let mut parsed_schema = ParsedSchema { + fields: Vec::new(), + metadata: HashMap::new(), + }; + for field in static_schema.fields.iter() { + let parsed_field = Fields { + name: field.name.clone(), + data_type: { + match field.data_type.as_str() { + "int" => DataType::Int64, + "double" | "float" => DataType::Float64, + "boolean" => DataType::Boolean, + "string" => DataType::Utf8, + "datetime" => DataType::Timestamp(TimeUnit::Millisecond, None), + "string_list" => { + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))) + } + "int_list" => { + DataType::List(Arc::new(Field::new("item", DataType::Int64, true))) + } + "double_list" | "float_list" => { + DataType::List(Arc::new(Field::new("item", DataType::Float64, true))) + } + "boolean_list" => { + DataType::List(Arc::new(Field::new("item", DataType::Boolean, true))) + } + _ => DataType::Null, + } + }, + nullable: default_nullable(), + dict_id: default_dict_id(), + dict_is_ordered: default_dict_is_ordered(), + metadata: HashMap::new(), + }; + parsed_schema.fields.push(parsed_field); + } + let schema = add_parseable_fields_to_static_schema(parsed_schema); + if schema.is_err() { + return Err(schema.err().unwrap()); + } + Ok(schema.unwrap()) +} + +fn add_parseable_fields_to_static_schema( + parsed_schema: 
ParsedSchema, +) -> Result, AnyError> { + let mut schema: Vec> = Vec::new(); + for field in parsed_schema.fields.iter() { + let field = Field::new(field.name.clone(), field.data_type.clone(), field.nullable); + schema.push(Arc::new(field)); + } + if get_field(&schema, DEFAULT_TAGS_KEY).is_some() { + return Err(anyhow!("field {} is a reserved field", DEFAULT_TAGS_KEY)); + }; + + if get_field(&schema, DEFAULT_METADATA_KEY).is_some() { + return Err(anyhow!( + "field {} is a reserved field", + DEFAULT_METADATA_KEY + )); + }; + + if get_field(&schema, DEFAULT_TIMESTAMP_KEY).is_some() { + return Err(anyhow!( + "field {} is a reserved field", + DEFAULT_TIMESTAMP_KEY + )); + }; + + // add the p_timestamp field to the event schema to the 0th index + schema.insert( + 0, + Arc::new(Field::new( + DEFAULT_TIMESTAMP_KEY, + DataType::Timestamp(TimeUnit::Millisecond, None), + true, + )), + ); + + // p_tags and p_metadata are added to the end of the schema + schema.push(Arc::new(Field::new(DEFAULT_TAGS_KEY, DataType::Utf8, true))); + schema.push(Arc::new(Field::new( + DEFAULT_METADATA_KEY, + DataType::Utf8, + true, + ))); + + // prepare the record batch and new fields to be added + let schema = Arc::new(Schema::new(schema)); + Ok(schema) +} + +fn default_nullable() -> bool { + true +} +fn default_dict_id() -> i64 { + 0 +} +fn default_dict_is_ordered() -> bool { + false +} diff --git a/server/src/storage.rs b/server/src/storage.rs index 975fcf445..f1efb5da8 100644 --- a/server/src/storage.rs +++ b/server/src/storage.rs @@ -24,7 +24,7 @@ use std::fmt::Debug; mod localfs; mod metrics_layer; -mod object_storage; +pub(crate) mod object_storage; pub mod retention; mod s3; pub mod staging; @@ -37,8 +37,18 @@ pub use store_metadata::{ put_remote_metadata, put_staging_metadata, resolve_parseable_metadata, StorageMetadata, }; +use self::retention::Retention; pub use self::staging::StorageDir; +// metadata file names in a Stream prefix +pub const STREAM_METADATA_FILE_NAME: &str = ".stream.json"; +pub const PARSEABLE_METADATA_FILE_NAME: &str = ".parseable.json"; +pub const STREAM_ROOT_DIRECTORY: &str = ".stream"; +pub const PARSEABLE_ROOT_DIRECTORY: &str = ".parseable"; +pub const SCHEMA_FILE_NAME: &str = ".schema"; +pub const ALERT_FILE_NAME: &str = ".alert.json"; +pub const MANIFEST_FILE: &str = "manifest.json"; + /// local sync interval to move data.records to /tmp dir of that stream. /// 60 sec is a reasonable value. 
pub const LOCAL_SYNC_INTERVAL: u64 = 60; @@ -69,6 +79,9 @@ pub struct ObjectStoreFormat { pub objectstore_format: String, #[serde(rename = "created-at")] pub created_at: String, + #[serde(rename = "first-event-at")] + #[serde(skip_serializing_if = "Option::is_none")] + pub first_event_at: Option, pub owner: Owner, pub permissions: Vec, pub stats: Stats, @@ -76,6 +89,27 @@ pub struct ObjectStoreFormat { pub snapshot: Snapshot, #[serde(default)] pub cache_enabled: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub retention: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub time_partition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub static_schema_flag: Option, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct StreamInfo { + #[serde(rename = "created-at")] + pub created_at: String, + #[serde(rename = "first-event-at")] + #[serde(skip_serializing_if = "Option::is_none")] + pub first_event_at: Option, + #[serde(default)] + pub cache_enabled: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub time_partition: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub static_schema_flag: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -113,11 +147,15 @@ impl Default for ObjectStoreFormat { version: CURRENT_SCHEMA_VERSION.to_string(), objectstore_format: CURRENT_OBJECT_STORE_VERSION.to_string(), created_at: Local::now().to_rfc3339(), + first_event_at: None, owner: Owner::new("".to_string(), "".to_string()), permissions: vec![Permisssion::new("parseable".to_string())], stats: Stats::default(), snapshot: Snapshot::default(), cache_enabled: false, + retention: None, + time_partition: None, + static_schema_flag: None, } } } @@ -129,7 +167,7 @@ impl ObjectStoreFormat { } } -#[derive(serde::Serialize)] +#[derive(serde::Serialize, PartialEq)] pub struct LogStream { pub name: String, } @@ -139,6 +177,8 @@ pub enum ObjectStorageError { // no such key inside the object storage #[error("{0} not found")] NoSuchKey(String), + #[error("Invalid Request: {0}")] + Invalid(#[from] anyhow::Error), // custom #[error("{0}")] @@ -158,6 +198,8 @@ pub enum ObjectStorageError { #[error("Unhandled Error: {0}")] UnhandledError(Box), + #[error("Error: {0}")] + PathError(relative_path::FromPathError), #[allow(dead_code)] #[error("Authentication Error: {0}")] diff --git a/server/src/storage/localfs.rs b/server/src/storage/localfs.rs index df88499a9..fc7bd7f23 100644 --- a/server/src/storage/localfs.rs +++ b/server/src/storage/localfs.rs @@ -27,14 +27,17 @@ use bytes::Bytes; use datafusion::{datasource::listing::ListingTableUrl, execution::runtime_env::RuntimeConfig}; use fs_extra::file::CopyOptions; use futures::{stream::FuturesUnordered, TryStreamExt}; -use relative_path::RelativePath; +use relative_path::{RelativePath, RelativePathBuf}; use tokio::fs::{self, DirEntry}; use tokio_stream::wrappers::ReadDirStream; use crate::metrics::storage::{localfs::REQUEST_RESPONSE_TIME, StorageMetrics}; -use crate::{option::validation, utils::validate_path_is_writeable}; +use crate::option::validation; -use super::{object_storage, LogStream, ObjectStorage, ObjectStorageError, ObjectStorageProvider}; +use super::{ + LogStream, ObjectStorage, ObjectStorageError, ObjectStorageProvider, PARSEABLE_ROOT_DIRECTORY, + SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, +}; #[derive(Debug, Clone, clap::Args)] #[command( @@ -74,6 +77,7 @@ impl ObjectStorageProvider for FSConfig { 
} pub struct LocalFS { + // absolute path of the data directory root: PathBuf, } @@ -110,6 +114,129 @@ impl ObjectStorage for LocalFS { res } + async fn get_ingestor_meta_file_paths( + &self, + ) -> Result, ObjectStorageError> { + let time = Instant::now(); + + let mut path_arr = vec![]; + let mut entries = fs::read_dir(&self.root).await?; + + while let Some(entry) = entries.next_entry().await? { + let flag = entry + .path() + .file_name() + .unwrap_or_default() + .to_str() + .unwrap_or_default() + .contains("ingestor"); + + if flag { + path_arr.push( + RelativePathBuf::from_path(entry.path().file_name().unwrap()) + .map_err(ObjectStorageError::PathError)?, + ); + } + } + + let time = time.elapsed().as_secs_f64(); + REQUEST_RESPONSE_TIME + .with_label_values(&["GET", "200"]) // this might not be the right status code + .observe(time); + + Ok(path_arr) + } + + async fn get_stream_file_paths( + &self, + stream_name: &str, + ) -> Result, ObjectStorageError> { + let time = Instant::now(); + let mut path_arr = vec![]; + + // = data/stream_name + let stream_dir_path = self.path_in_root(&RelativePathBuf::from(stream_name)); + let mut entries = fs::read_dir(&stream_dir_path).await?; + + while let Some(entry) = entries.next_entry().await? { + let flag = entry + .path() + .file_name() + .ok_or(ObjectStorageError::NoSuchKey( + "Dir Entry Suggests no file present".to_string(), + ))? + .to_str() + .expect("file name is parseable to str") + .contains("ingestor"); + + if flag { + path_arr.push(RelativePathBuf::from_iter([ + stream_name, + entry.path().file_name().unwrap().to_str().unwrap(), // checking the error before hand + ])); + } + } + + path_arr.push(RelativePathBuf::from_iter([ + stream_name, + STREAM_METADATA_FILE_NAME, + ])); + path_arr.push(RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME])); + + let time = time.elapsed().as_secs_f64(); + REQUEST_RESPONSE_TIME + .with_label_values(&["GET", "200"]) // this might not be the right status code + .observe(time); + + Ok(path_arr) + } + + /// currently it is not using the starts_with_pattern + async fn get_objects( + &self, + base_path: Option<&RelativePath>, + filter_func: Box<(dyn Fn(String) -> bool + std::marker::Send + 'static)>, + ) -> Result, ObjectStorageError> { + let time = Instant::now(); + + let prefix = if let Some(path) = base_path { + path.to_path(&self.root) + } else { + self.root.clone() + }; + + let mut entries = fs::read_dir(&prefix).await?; + let mut res = Vec::new(); + while let Some(entry) = entries.next_entry().await? { + let path = entry + .path() + .file_name() + .ok_or(ObjectStorageError::NoSuchKey( + "Dir Entry suggests no file present".to_string(), + ))? 
+ .to_str() + .expect("file name is parseable to str") + .to_owned(); + let ingestor_file = filter_func(path); + + if !ingestor_file { + continue; + } + + let file = fs::read(entry.path()).await?; + res.push(file.into()); + } + + // maybe change the return code + let status = if res.is_empty() { "200" } else { "400" }; + let time = time.elapsed().as_secs_f64(); + REQUEST_RESPONSE_TIME + .with_label_values(&["GET", status]) + .observe(time); + + Ok(res) + } + async fn put_object( &self, path: &RelativePath, @@ -138,9 +265,15 @@ impl ObjectStorage for LocalFS { Ok(()) } + async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { + let path = self.path_in_root(path); + tokio::fs::remove_file(path).await?; + Ok(()) + } + async fn check(&self) -> Result<(), ObjectStorageError> { - fs::create_dir_all(&self.root).await?; - validate_path_is_writeable(&self.root) + fs::create_dir_all(&self.root) + .await .map_err(|e| ObjectStorageError::UnhandledError(e.into())) } @@ -149,8 +282,16 @@ impl ObjectStorage for LocalFS { Ok(fs::remove_dir_all(path).await?) } + async fn try_delete_ingestor_meta( + &self, + ingestor_filename: String, + ) -> Result<(), ObjectStorageError> { + let path = self.root.join(ingestor_filename); + Ok(fs::remove_file(path).await?) + } + async fn list_streams(&self) -> Result, ObjectStorageError> { - let ignore_dir = &["lost+found"]; + let ignore_dir = &["lost+found", PARSEABLE_ROOT_DIRECTORY]; let directories = ReadDirStream::new(fs::read_dir(&self.root).await?); let entries: Vec = directories.try_collect().await?; let entries = entries @@ -169,6 +310,43 @@ impl ObjectStorage for LocalFS { Ok(logstreams) } + async fn list_old_streams(&self) -> Result, ObjectStorageError> { + let ignore_dir = &["lost+found", PARSEABLE_ROOT_DIRECTORY]; + let directories = ReadDirStream::new(fs::read_dir(&self.root).await?); + let entries: Vec = directories.try_collect().await?; + let entries = entries + .into_iter() + .map(|entry| dir_with_old_stream(entry, ignore_dir)); + + let logstream_dirs: Vec> = + FuturesUnordered::from_iter(entries).try_collect().await?; + + let logstreams = logstream_dirs + .into_iter() + .flatten() + .map(|name| LogStream { name }) + .collect(); + + Ok(logstreams) + } + + async fn list_dirs(&self) -> Result, ObjectStorageError> { + let dirs = ReadDirStream::new(fs::read_dir(&self.root).await?) + .try_collect::>() + .await? + .into_iter() + .map(dir_name); + + let dirs = FuturesUnordered::from_iter(dirs) + .try_collect::>() + .await? 
+ .into_iter() + .flatten() + .collect::>(); + + Ok(dirs) + } + async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { let path = self.root.join(stream_name); let directories = ReadDirStream::new(fs::read_dir(&path).await?); @@ -195,7 +373,8 @@ impl ObjectStorage for LocalFS { fn absolute_url(&self, prefix: &RelativePath) -> object_store::path::Path { object_store::path::Path::parse( - format!("{}", self.root.join(prefix.as_str()).display()).trim_start_matches('/'), + format!("{}", self.root.join(prefix.as_str()).display()) + .trim_start_matches(std::path::MAIN_SEPARATOR), ) .unwrap() } @@ -210,6 +389,50 @@ impl ObjectStorage for LocalFS { fn store_url(&self) -> url::Url { url::Url::parse("file:///").unwrap() } + + fn get_bucket_name(&self) -> String { + self.root + .iter() + .last() + .expect("can be unwrapped without checking as the path is absolute") + .to_str() + .expect("valid unicode") + .to_string() + } +} + +async fn dir_with_old_stream( + entry: DirEntry, + ignore_dirs: &[&str], +) -> Result, ObjectStorageError> { + let dir_name = entry + .path() + .file_name() + .expect("valid path") + .to_str() + .expect("valid unicode") + .to_owned(); + + if ignore_dirs.contains(&dir_name.as_str()) { + return Ok(None); + } + + if entry.file_type().await?.is_dir() { + let path = entry.path(); + + // even in ingest mode, we should only look for the global stream metadata file + let stream_json_path = path.join(STREAM_METADATA_FILE_NAME); + + if stream_json_path.exists() { + Ok(Some(dir_name)) + } else { + let err: Box = + format!("found {}", entry.path().display()).into(); + Err(ObjectStorageError::UnhandledError(err)) + } + } else { + Ok(None) + } } async fn dir_with_stream( @@ -230,7 +453,12 @@ async fn dir_with_stream( if entry.file_type().await?.is_dir() { let path = entry.path(); - let stream_json_path = path.join(object_storage::STREAM_METADATA_FILE_NAME); + + // even in ingest mode, we should only look for the global stream metadata file + let stream_json_path = path + .join(STREAM_ROOT_DIRECTORY) + .join(STREAM_METADATA_FILE_NAME); + if stream_json_path.exists() { Ok(Some(dir_name)) } else { diff --git a/server/src/storage/object_storage.rs b/server/src/storage/object_storage.rs index be4b1c1c6..77eb9f20d 100644 --- a/server/src/storage/object_storage.rs +++ b/server/src/storage/object_storage.rs @@ -20,6 +20,13 @@ use super::{ retention::Retention, staging::convert_disk_files_to_parquet, LogStream, ObjectStorageError, ObjectStoreFormat, Permisssion, StorageDir, StorageMetadata, }; +use super::{ + ALERT_FILE_NAME, MANIFEST_FILE, PARSEABLE_METADATA_FILE_NAME, PARSEABLE_ROOT_DIRECTORY, + SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, +}; + +use crate::handlers::http::modal::ingest_server::INGESTOR_META; +use crate::option::Mode; use crate::{ alerts::Alerts, @@ -49,13 +56,6 @@ use std::{ time::{Duration, Instant}, }; -// metadata file names in a Stream prefix -pub(super) const STREAM_METADATA_FILE_NAME: &str = ".stream.json"; -pub(super) const PARSEABLE_METADATA_FILE_NAME: &str = ".parseable.json"; -const SCHEMA_FILE_NAME: &str = ".schema"; -const ALERT_FILE_NAME: &str = ".alert.json"; -const MANIFEST_FILE: &str = "manifest.json"; - pub trait ObjectStorageProvider: StorageMetrics + std::fmt::Debug { fn get_datafusion_runtime(&self) -> RuntimeConfig; fn get_object_store(&self) -> Arc; @@ -66,6 +66,12 @@ pub trait ObjectStorageProvider: StorageMetrics + std::fmt::Debug { #[async_trait] pub trait ObjectStorage: Sync + 'static { async fn 
get_object(&self, path: &RelativePath) -> Result; + // want to make it more generic with a filter function + async fn get_objects( + &self, + base_path: Option<&RelativePath>, + filter_fun: Box bool + Send>, + ) -> Result, ObjectStorageError>; async fn put_object( &self, path: &RelativePath, @@ -75,16 +81,28 @@ pub trait ObjectStorage: Sync + 'static { async fn check(&self) -> Result<(), ObjectStorageError>; async fn delete_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError>; async fn list_streams(&self) -> Result, ObjectStorageError>; + async fn list_old_streams(&self) -> Result, ObjectStorageError>; + async fn list_dirs(&self) -> Result, ObjectStorageError>; async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError>; async fn upload_file(&self, key: &str, path: &Path) -> Result<(), ObjectStorageError>; - + async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError>; + async fn get_ingestor_meta_file_paths( + &self, + ) -> Result, ObjectStorageError>; + async fn get_stream_file_paths( + &self, + stream_name: &str, + ) -> Result, ObjectStorageError>; + async fn try_delete_ingestor_meta( + &self, + ingestor_filename: String, + ) -> Result<(), ObjectStorageError>; /// Returns the amount of time taken by the `ObjectStore` to perform a get /// call. async fn get_latency(&self) -> Duration { // It's Ok to `unwrap` here. The hardcoded value will always Result in // an `Ok`. - let path = RelativePathBuf::from_path(".parseable.json").unwrap(); - + let path = parseable_json_path(); let start = Instant::now(); let _ = self.get_object(&path).await; start.elapsed() @@ -105,15 +123,29 @@ pub trait ObjectStorage: Sync + 'static { Ok(()) } - async fn create_stream(&self, stream_name: &str) -> Result<(), ObjectStorageError> { + async fn create_stream( + &self, + stream_name: &str, + time_partition: &str, + static_schema_flag: &str, + schema: Arc, + ) -> Result<(), ObjectStorageError> { let mut format = ObjectStoreFormat::default(); format.set_id(CONFIG.parseable.username.clone()); let permission = Permisssion::new(CONFIG.parseable.username.clone()); format.permissions = vec![permission]; - + if time_partition.is_empty() { + format.time_partition = None; + } else { + format.time_partition = Some(time_partition.to_string()); + } + if static_schema_flag != "true" { + format.static_schema_flag = None; + } else { + format.static_schema_flag = Some(static_schema_flag.to_string()); + } let format_json = to_bytes(&format); - - self.put_object(&schema_path(stream_name), to_bytes(&Schema::empty())) + self.put_object(&schema_path(stream_name), to_bytes(&schema)) .await?; self.put_object(&stream_json_path(stream_name), format_json) @@ -168,6 +200,35 @@ pub trait ObjectStorage: Sync + 'static { .await } + async fn get_schema_on_server_start( + &self, + stream_name: &str, + ) -> Result { + // try get my schema + // if fails get the base schema + // put the schema to storage?? + let schema_path = schema_path(stream_name); + let byte_data = match self.get_object(&schema_path).await { + Ok(bytes) => bytes, + Err(err) => { + log::info!("{:?}", err); + // base schema path + let schema_path = RelativePathBuf::from_iter([ + stream_name, + STREAM_ROOT_DIRECTORY, + SCHEMA_FILE_NAME, + ]); + let data = self.get_object(&schema_path).await?; + // schema was not found in store, so it needs to be placed + self.put_schema(stream_name, &serde_json::from_slice(&data)?) + .await?; + + data + } + }; + Ok(serde_json::from_slice(&byte_data)?) 
+ } + async fn get_schema(&self, stream_name: &str) -> Result { let schema_map = self.get_object(&schema_path(stream_name)).await?; Ok(serde_json::from_slice(&schema_map)?) @@ -194,7 +255,31 @@ pub trait ObjectStorage: Sync + 'static { &self, stream_name: &str, ) -> Result { - let stream_metadata = self.get_object(&stream_json_path(stream_name)).await?; + let stream_metadata = match self.get_object(&stream_json_path(stream_name)).await { + Ok(data) => data, + Err(_) => { + // get the base stream metadata + let bytes = self + .get_object(&RelativePathBuf::from_iter([ + stream_name, + STREAM_ROOT_DIRECTORY, + STREAM_METADATA_FILE_NAME, + ])) + .await?; + + let mut config = serde_json::from_slice::(&bytes) + .expect("parseable config is valid json"); + + if CONFIG.parseable.mode == Mode::Ingest { + config.stats = Stats::default(); + config.snapshot.manifest_list = vec![]; + } + + self.put_stream_manifest(stream_name, &config).await?; + bytes + } + }; + Ok(serde_json::from_slice(&stream_metadata).expect("parseable config is valid json")) } @@ -207,6 +292,22 @@ pub trait ObjectStorage: Sync + 'static { self.put_object(&path, to_bytes(manifest)).await } + /// for future use + async fn get_stats_for_first_time( + &self, + stream_name: &str, + ) -> Result { + let path = RelativePathBuf::from_iter([stream_name, STREAM_METADATA_FILE_NAME]); + let stream_metadata = self.get_object(&path).await?; + let stream_metadata: Value = + serde_json::from_slice(&stream_metadata).expect("parseable config is valid json"); + let stats = &stream_metadata["stats"]; + + let stats = serde_json::from_value(stats.clone()).unwrap_or_default(); + + Ok(stats) + } + async fn get_stats(&self, stream_name: &str) -> Result { let stream_metadata = self.get_object(&stream_json_path(stream_name)).await?; let stream_metadata: Value = @@ -230,7 +331,7 @@ pub trait ObjectStorage: Sync + 'static { .get("retention") .cloned(); if let Some(retention) = retention { - Ok(serde_json::from_value(retention).unwrap()) + Ok(serde_json::from_value(retention)?) 
} else { Ok(Retention::default()) } @@ -263,6 +364,7 @@ pub trait ObjectStorage: Sync + 'static { } } + // get the manifest info async fn get_manifest( &self, path: &RelativePath, @@ -291,12 +393,14 @@ pub trait ObjectStorage: Sync + 'static { self.put_object(&path, to_bytes(&manifest)).await } - async fn get_snapshot(&self, stream: &str) -> Result { + // gets the snapshot of the stream + async fn get_object_store_format( + &self, + stream: &str, + ) -> Result { let path = stream_json_path(stream); let bytes = self.get_object(&path).await?; - Ok(serde_json::from_slice::(&bytes) - .expect("snapshot is valid json") - .snapshot) + Ok(serde_json::from_slice::(&bytes).expect("snapshot is valid json")) } async fn put_snapshot( @@ -325,12 +429,20 @@ pub trait ObjectStorage: Sync + 'static { let cache_enabled = STREAM_INFO .cache_enabled(stream) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; + let time_partition = STREAM_INFO + .get_time_partition(stream) + .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; let dir = StorageDir::new(stream); - let schema = convert_disk_files_to_parquet(stream, &dir) + let schema = convert_disk_files_to_parquet(stream, &dir, time_partition) .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; if let Some(schema) = schema { - commit_schema_to_storage(stream, schema).await?; + let static_schema_flag = STREAM_INFO + .get_static_schema_flag(stream) + .map_err(|err| ObjectStorageError::UnhandledError(Box::new(err)))?; + if static_schema_flag.is_none() { + commit_schema_to_storage(stream, schema).await?; + } } let parquet_files = dir.parquet_files(); @@ -362,7 +474,7 @@ pub trait ObjectStorage: Sync + 'static { cache_updates .entry(stream) .or_default() - .push((absolute_path, file)) + .push((absolute_path, file)); } else { let _ = fs::remove_file(file); } @@ -401,9 +513,12 @@ pub trait ObjectStorage: Sync + 'static { Ok(()) } + + // pick a better name + fn get_bucket_name(&self) -> String; } -async fn commit_schema_to_storage( +pub async fn commit_schema_to_storage( stream_name: &str, schema: Schema, ) -> Result<(), ObjectStorageError> { @@ -422,25 +537,78 @@ fn to_bytes(any: &(impl ?Sized + serde::Serialize)) -> Bytes { #[inline(always)] fn schema_path(stream_name: &str) -> RelativePathBuf { - RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME]) + match CONFIG.parseable.mode { + Mode::Ingest => { + let file_name = format!( + ".ingestor.{}{}", + INGESTOR_META.ingestor_id.clone(), + SCHEMA_FILE_NAME + ); + + RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY, &file_name]) + } + Mode::All | Mode::Query => { + RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY, SCHEMA_FILE_NAME]) + } + } } #[inline(always)] -fn stream_json_path(stream_name: &str) -> RelativePathBuf { - RelativePathBuf::from_iter([stream_name, STREAM_METADATA_FILE_NAME]) +pub fn stream_json_path(stream_name: &str) -> RelativePathBuf { + match &CONFIG.parseable.mode { + Mode::Ingest => { + let file_name = format!( + ".ingestor.{}{}", + INGESTOR_META.get_ingestor_id(), + STREAM_METADATA_FILE_NAME + ); + RelativePathBuf::from_iter([stream_name, STREAM_ROOT_DIRECTORY, &file_name]) + } + Mode::Query | Mode::All => RelativePathBuf::from_iter([ + stream_name, + STREAM_ROOT_DIRECTORY, + STREAM_METADATA_FILE_NAME, + ]), + } } +/// path will be ".parseable/.parsable.json" #[inline(always)] -fn parseable_json_path() -> RelativePathBuf { - RelativePathBuf::from(PARSEABLE_METADATA_FILE_NAME) +pub fn parseable_json_path() -> 
RelativePathBuf { + RelativePathBuf::from_iter([PARSEABLE_ROOT_DIRECTORY, PARSEABLE_METADATA_FILE_NAME]) } +/// TODO: Needs to be updated for distributed mode #[inline(always)] fn alert_json_path(stream_name: &str) -> RelativePathBuf { RelativePathBuf::from_iter([stream_name, ALERT_FILE_NAME]) } #[inline(always)] -fn manifest_path(prefix: &str) -> RelativePathBuf { - RelativePathBuf::from_iter([prefix, MANIFEST_FILE]) +pub fn manifest_path(prefix: &str) -> RelativePathBuf { + if CONFIG.parseable.mode == Mode::Ingest { + let manifest_file_name = format!( + "ingestor.{}.{}", + INGESTOR_META.get_ingestor_id(), + MANIFEST_FILE + ); + RelativePathBuf::from_iter([prefix, &manifest_file_name]) + } else { + RelativePathBuf::from_iter([prefix, MANIFEST_FILE]) + } +} + +#[inline(always)] +pub fn ingestor_metadata_path(id: Option<&str>) -> RelativePathBuf { + if let Some(id) = id { + return RelativePathBuf::from_iter([ + PARSEABLE_ROOT_DIRECTORY, + &format!("ingestor.{}.json", id), + ]); + } + + RelativePathBuf::from_iter([ + PARSEABLE_ROOT_DIRECTORY, + &format!("ingestor.{}.json", INGESTOR_META.get_ingestor_id()), + ]) } diff --git a/server/src/storage/retention.rs b/server/src/storage/retention.rs index c395c7d84..f67a15eb7 100644 --- a/server/src/storage/retention.rs +++ b/server/src/storage/retention.rs @@ -43,42 +43,55 @@ fn async_runtime() -> tokio::runtime::Runtime { .unwrap() } -pub async fn load_retention_from_global() { +pub fn load_retention_from_global() { log::info!("loading retention for all streams"); - for stream in STREAM_INFO.list_streams() { - let res = CONFIG - .storage() - .get_object_store() - .get_retention(&stream) - .await; - match res { - Ok(config) => { - if config.tasks.is_empty() { - log::info!("skipping loading retention for {stream}"); - continue; - } - init_scheduler(&stream, config) - } - Err(err) => log::warn!("failed to load retention config for {stream} due to {err:?}"), - } - } + init_scheduler(); } -pub fn init_scheduler(stream: &str, config: Retention) { - log::info!("Setting up schedular for {stream}"); +pub fn init_scheduler() { + log::info!("Setting up schedular"); let mut scheduler = AsyncScheduler::new(); - for Task { action, days, .. } in config.tasks.into_iter() { - let func = match action { - Action::Delete => { - let stream = stream.to_string(); - move || action::delete(stream.clone(), u32::from(days)) - } - }; + let func = move || async { + for stream in STREAM_INFO.list_streams() { + let res = CONFIG + .storage() + .get_object_store() + .get_retention(&stream) + .await; + + match res { + Ok(config) => { + for Task { action, days, .. 
} in config.tasks.into_iter() { + match action { + Action::Delete => { + let stream = stream.to_string(); + thread::spawn(move || { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + // Run the asynchronous delete action + action::delete(stream.clone(), u32::from(days)).await; + }); + }); + } + }; + } + } + Err(err) => { + log::warn!("failed to load retention config for {stream} due to {err:?}") + } + }; + } + }; - scheduler.every(1.day()).at("00:00").run(func); - } + // Execute once on startup + thread::spawn(move || { + let rt = async_runtime(); + rt.block_on(func()); + }); + + scheduler.every(1.day()).at("00:00").run(func); - let handler = thread::spawn(|| { + let scheduler_handler = thread::spawn(|| { let rt = async_runtime(); rt.block_on(async move { loop { @@ -88,7 +101,7 @@ pub fn init_scheduler(stream: &str, config: Retention) { }); }); - *SCHEDULER_HANDLER.lock().unwrap() = Some(handler); + *SCHEDULER_HANDLER.lock().unwrap() = Some(scheduler_handler); log::info!("Scheduler is initialized") } @@ -180,13 +193,12 @@ mod action { use itertools::Itertools; use relative_path::RelativePathBuf; - use crate::option::CONFIG; + use crate::{catalog::remove_manifest_from_snapshot, metadata, option::CONFIG}; pub(super) async fn delete(stream_name: String, days: u32) { - log::info!("running retention task - delete"); + log::info!("running retention task - delete for stream={stream_name}"); let retain_until = get_retain_until(Utc::now().date_naive(), days as u64); - - let Ok(dates) = CONFIG + let Ok(mut dates) = CONFIG .storage() .get_object_store() .list_dates(&stream_name) @@ -194,12 +206,12 @@ mod action { else { return; }; - + dates.retain(|date| date.starts_with("date")); let dates_to_delete = dates .into_iter() .filter(|date| string_to_date(date) < retain_until) .collect_vec(); - + let dates = dates_to_delete.clone(); let delete_tasks = FuturesUnordered::new(); for date in dates_to_delete { let path = RelativePathBuf::from_iter([&stream_name, &date]); @@ -219,6 +231,18 @@ mod action { log::error!("Failed to run delete task {err:?}") } } + + let store = CONFIG.storage().get_object_store(); + let res = remove_manifest_from_snapshot(store.clone(), &stream_name, dates).await; + if let Ok(first_event_at) = res { + if let Err(err) = metadata::STREAM_INFO.set_first_event_at(&stream_name, first_event_at) + { + log::error!( + "Failed to update first_event_at in streaminfo for stream {:?} {err:?}", + stream_name + ); + } + } } fn get_retain_until(current_date: NaiveDate, days: u64) -> NaiveDate { diff --git a/server/src/storage/s3.rs b/server/src/storage/s3.rs index af171bfae..ddeeea786 100644 --- a/server/src/storage/s3.rs +++ b/server/src/storage/s3.rs @@ -29,7 +29,7 @@ use object_store::aws::{AmazonS3, AmazonS3Builder, AmazonS3ConfigKey, Checksum}; use object_store::limit::LimitStore; use object_store::path::Path as StorePath; use object_store::{ClientOptions, ObjectStore}; -use relative_path::RelativePath; +use relative_path::{RelativePath, RelativePathBuf}; use tokio::fs::OpenOptions; use tokio::io::{AsyncReadExt, AsyncWriteExt}; @@ -39,10 +39,13 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use crate::metrics::storage::{s3::REQUEST_RESPONSE_TIME, StorageMetrics}; -use crate::storage::{LogStream, ObjectStorage, ObjectStorageError}; +use crate::storage::{LogStream, ObjectStorage, ObjectStorageError, PARSEABLE_ROOT_DIRECTORY}; use super::metrics_layer::MetricLayer; -use super::{object_storage, ObjectStorageProvider}; +use 
super::object_storage::parseable_json_path; +use super::{ + ObjectStorageProvider, SCHEMA_FILE_NAME, STREAM_METADATA_FILE_NAME, STREAM_ROOT_DIRECTORY, +}; // in bytes const MULTIPART_UPLOAD_SIZE: usize = 1024 * 1024 * 100; @@ -197,6 +200,7 @@ impl ObjectStorageProvider for S3Config { Arc::new(S3 { client: s3, bucket: self.bucket_name.clone(), + root: StorePath::from(""), }) } @@ -209,20 +213,21 @@ impl ObjectStorageProvider for S3Config { } } -fn to_path(path: &RelativePath) -> StorePath { +fn to_object_store_path(path: &RelativePath) -> StorePath { StorePath::from(path.as_str()) } pub struct S3 { client: LimitStore, bucket: String, + root: StorePath, } impl S3 { async fn _get_object(&self, path: &RelativePath) -> Result { let instant = Instant::now(); - let resp = self.client.get(&to_path(path)).await; + let resp = self.client.get(&to_object_store_path(path)).await; match resp { Ok(resp) => { @@ -249,7 +254,7 @@ impl S3 { resource: Bytes, ) -> Result<(), ObjectStorageError> { let time = Instant::now(); - let resp = self.client.put(&to_path(path), resource).await; + let resp = self.client.put(&to_object_store_path(path), resource).await; let status = if resp.is_ok() { "200" } else { "400" }; let time = time.elapsed().as_secs_f64(); REQUEST_RESPONSE_TIME @@ -292,19 +297,23 @@ impl S3 { async fn _list_streams(&self) -> Result, ObjectStorageError> { let resp = self.client.list_with_delimiter(None).await?; - let common_prefixes = resp.common_prefixes; + let common_prefixes = resp.common_prefixes; // get all dirs // return prefixes at the root level let dirs: Vec<_> = common_prefixes .iter() .filter_map(|path| path.parts().next()) .map(|name| name.as_ref().to_string()) + .filter(|x| x != PARSEABLE_ROOT_DIRECTORY) .collect(); let stream_json_check = FuturesUnordered::new(); for dir in &dirs { - let key = format!("{}/{}", dir, object_storage::STREAM_METADATA_FILE_NAME); + let key = format!( + "{}/{}/{}", + dir, STREAM_ROOT_DIRECTORY, STREAM_METADATA_FILE_NAME + ); let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; stream_json_check.push(task); } @@ -403,6 +412,102 @@ impl ObjectStorage for S3 { Ok(self._get_object(path).await?) } + async fn get_objects( + &self, + base_path: Option<&RelativePath>, + filter_func: Box bool + Send>, + ) -> Result, ObjectStorageError> { + let instant = Instant::now(); + + let prefix = if let Some(base_path) = base_path { + to_object_store_path(base_path) + } else { + self.root.clone() + }; + + let mut list_stream = self.client.list(Some(&prefix)).await?; + + let mut res = vec![]; + + while let Some(meta) = list_stream.next().await.transpose()? { + let ingestor_file = filter_func(meta.location.filename().unwrap().to_string()); + + if !ingestor_file { + continue; + } + + let byts = self + .get_object( + RelativePath::from_path(meta.location.as_ref()) + .map_err(ObjectStorageError::PathError)?, + ) + .await?; + + res.push(byts); + } + + let instant = instant.elapsed().as_secs_f64(); + REQUEST_RESPONSE_TIME + .with_label_values(&["GET", "200"]) + .observe(instant); + + Ok(res) + } + + async fn get_ingestor_meta_file_paths( + &self, + ) -> Result, ObjectStorageError> { + let time = Instant::now(); + let mut path_arr = vec![]; + let mut object_stream = self.client.list(Some(&self.root)).await?; + + while let Some(meta) = object_stream.next().await.transpose()? 
{ + let flag = meta.location.filename().unwrap().starts_with("ingestor"); + + if flag { + path_arr.push(RelativePathBuf::from(meta.location.as_ref())); + } + } + + let time = time.elapsed().as_secs_f64(); + REQUEST_RESPONSE_TIME + .with_label_values(&["GET", "200"]) + .observe(time); + + Ok(path_arr) + } + + async fn get_stream_file_paths( + &self, + stream_name: &str, + ) -> Result, ObjectStorageError> { + let time = Instant::now(); + let mut path_arr = vec![]; + let path = to_object_store_path(&RelativePathBuf::from(stream_name)); + let mut object_stream = self.client.list(Some(&path)).await?; + + while let Some(meta) = object_stream.next().await.transpose()? { + let flag = meta.location.filename().unwrap().starts_with(".ingestor"); + + if flag { + path_arr.push(RelativePathBuf::from(meta.location.as_ref())); + } + } + + path_arr.push(RelativePathBuf::from_iter([ + stream_name, + STREAM_METADATA_FILE_NAME, + ])); + path_arr.push(RelativePathBuf::from_iter([stream_name, SCHEMA_FILE_NAME])); + + let time = time.elapsed().as_secs_f64(); + REQUEST_RESPONSE_TIME + .with_label_values(&["GET", "200"]) + .observe(time); + + Ok(path_arr) + } + async fn put_object( &self, path: &RelativePath, @@ -421,10 +526,14 @@ impl ObjectStorage for S3 { Ok(()) } + async fn delete_object(&self, path: &RelativePath) -> Result<(), ObjectStorageError> { + Ok(self.client.delete(&to_object_store_path(path)).await?) + } + async fn check(&self) -> Result<(), ObjectStorageError> { Ok(self .client - .head(&object_storage::PARSEABLE_METADATA_FILE_NAME.into()) + .head(&to_object_store_path(&parseable_json_path())) .await .map(|_| ())?) } @@ -435,12 +544,59 @@ impl ObjectStorage for S3 { Ok(()) } + async fn try_delete_ingestor_meta( + &self, + ingestor_filename: String, + ) -> Result<(), ObjectStorageError> { + let file = RelativePathBuf::from(&ingestor_filename); + match self.client.delete(&to_object_store_path(&file)).await { + Ok(_) => Ok(()), + Err(err) => { + // if the object is not found, it is not an error + // the given url path was incorrect + if matches!(err, object_store::Error::NotFound { .. 
}) { + log::error!("Node does not exist"); + Err(err.into()) + } else { + log::error!("Error deleting ingestor meta file: {:?}", err); + Err(err.into()) + } + } + } + } + async fn list_streams(&self) -> Result, ObjectStorageError> { let streams = self._list_streams().await?; Ok(streams) } + async fn list_old_streams(&self) -> Result, ObjectStorageError> { + let resp = self.client.list_with_delimiter(None).await?; + + let common_prefixes = resp.common_prefixes; // get all dirs + + // return prefixes at the root level + let dirs: Vec<_> = common_prefixes + .iter() + .filter_map(|path| path.parts().next()) + .map(|name| name.as_ref().to_string()) + .filter(|x| x != PARSEABLE_ROOT_DIRECTORY) + .collect(); + + let stream_json_check = FuturesUnordered::new(); + + for dir in &dirs { + let key = format!("{}/{}", dir, STREAM_METADATA_FILE_NAME); + let task = async move { self.client.head(&StorePath::from(key)).await.map(|_| ()) }; + stream_json_check.push(task); + } + + stream_json_check.try_collect().await?; + + Ok(dirs.into_iter().map(|name| LogStream { name }).collect()) + } + async fn list_dates(&self, stream_name: &str) -> Result, ObjectStorageError> { let streams = self._list_dates(stream_name).await?; @@ -470,6 +626,22 @@ impl ObjectStorage for S3 { fn store_url(&self) -> url::Url { url::Url::parse(&format!("s3://{}", self.bucket)).unwrap() } + + async fn list_dirs(&self) -> Result, ObjectStorageError> { + let pre = object_store::path::Path::from("/"); + let resp = self.client.list_with_delimiter(Some(&pre)).await?; + + Ok(resp + .common_prefixes + .iter() + .flat_map(|path| path.parts()) + .map(|name| name.as_ref().to_string()) + .collect::>()) + } + + fn get_bucket_name(&self) -> String { + self.bucket.clone() + } } impl From for ObjectStorageError { diff --git a/server/src/storage/staging.rs b/server/src/storage/staging.rs index 6ee908079..512a9c4c0 100644 --- a/server/src/storage/staging.rs +++ b/server/src/storage/staging.rs @@ -25,7 +25,19 @@ use std::{ sync::Arc, }; +use crate::{ + event::DEFAULT_TIMESTAMP_KEY, + handlers::http::modal::{ingest_server::INGESTOR_META, IngestorMetadata, DEFAULT_VERSION}, + metrics, + option::{Mode, CONFIG}, + storage::OBJECT_STORE_DATA_GRANULARITY, + utils::{ + self, arrow::merged_reader::MergedReverseRecordReader, get_ingestor_id, get_url, + hostname_unchecked, + }, +}; use arrow_schema::{ArrowError, Schema}; +use base64::Engine; use chrono::{NaiveDateTime, Timelike, Utc}; use parquet::{ arrow::ArrowWriter, @@ -36,14 +48,6 @@ use parquet::{ schema::types::ColumnPath, }; -use crate::{ - event::DEFAULT_TIMESTAMP_KEY, - metrics, - option::CONFIG, - storage::OBJECT_STORE_DATA_GRANULARITY, - utils::{self, arrow::merged_reader::MergedReverseRecordReader}, -}; - const ARROW_FILE_EXTENSION: &str = "data.arrows"; const PARQUET_FILE_EXTENSION: &str = "data.parquet"; @@ -64,8 +68,13 @@ impl StorageDir { + &utils::hour_to_prefix(time.hour()) + &utils::minute_to_prefix(time.minute(), OBJECT_STORE_DATA_GRANULARITY).unwrap(); let local_uri = str::replace(&uri, "/", "."); - let hostname = utils::hostname_unchecked(); - format!("{local_uri}{hostname}.{extention}") + let hostname = hostname_unchecked(); + if CONFIG.parseable.mode == Mode::Ingest { + let id = INGESTOR_META.get_ingestor_id(); + format!("{local_uri}{hostname}{id}.{extention}") + } else { + format!("{local_uri}{hostname}.{extention}") + } } fn filename_by_time(stream_hash: &str, time: NaiveDateTime) -> String { @@ -124,6 +133,7 @@ impl StorageDir { // hashmap but exclude where hot filename matches let mut 
grouped_arrow_file: HashMap> = HashMap::new(); let mut arrow_files = self.arrow_files(); + arrow_files.retain(|path| { !path .file_name() @@ -132,6 +142,7 @@ impl StorageDir { .unwrap() .ends_with(&hot_filename) }); + for arrow_file_path in arrow_files { let key = Self::arrow_path_to_parquet(&arrow_file_path); grouped_arrow_file @@ -157,6 +168,17 @@ impl StorageDir { fn arrow_path_to_parquet(path: &Path) -> PathBuf { let filename = path.file_name().unwrap().to_str().unwrap(); let (_, filename) = filename.split_once('.').unwrap(); + let filename = filename.rsplit_once('.').expect("contains the delim `.`"); + let filename = format!("{}.{}", filename.0, filename.1); + + /* + let file_stem = path.file_stem().unwrap().to_str().unwrap(); + let random_string = + rand::distributions::Alphanumeric.sample_string(&mut rand::thread_rng(), 20); + let (_, filename) = file_stem.split_once('.').unwrap(); + let filename_with_random_number = format!("{}.{}.{}", filename, random_number, "arrows"); + */ + let mut parquet_path = path.to_owned(); parquet_path.set_file_name(filename); parquet_path.set_extension("parquet"); @@ -175,6 +197,7 @@ pub fn to_parquet_path(stream_name: &str, time: NaiveDateTime) -> PathBuf { pub fn convert_disk_files_to_parquet( stream: &str, dir: &StorageDir, + time_partition: Option, ) -> Result, MoveDataError> { let mut schemas = Vec::new(); @@ -182,6 +205,12 @@ pub fn convert_disk_files_to_parquet( let staging_files = dir.arrow_files_grouped_exclude_time(time); if staging_files.is_empty() { metrics::STAGING_FILES.with_label_values(&[stream]).set(0); + metrics::STORAGE_SIZE + .with_label_values(&["staging", stream, "arrows"]) + .set(0); + metrics::STORAGE_SIZE + .with_label_values(&["staging", stream, "parquet"]) + .set(0); } for (parquet_path, files) in staging_files { @@ -199,11 +228,14 @@ pub fn convert_disk_files_to_parquet( } let record_reader = MergedReverseRecordReader::try_new(&files).unwrap(); - + let merged_schema = record_reader.merged_schema(); + let mut index_time_partition: usize = 0; + if let Some(time_partition) = time_partition.as_ref() { + index_time_partition = merged_schema.index_of(time_partition).unwrap(); + } let parquet_file = fs::File::create(&parquet_path).map_err(|_| MoveDataError::Create)?; + let props = parquet_writer_props(time_partition.clone(), index_time_partition).build(); - let props = parquet_writer_props().build(); - let merged_schema = record_reader.merged_schema(); schemas.push(merged_schema.clone()); let schema = Arc::new(merged_schema); let mut writer = ArrowWriter::try_new(parquet_file, schema.clone(), Some(props))?; @@ -229,19 +261,131 @@ pub fn convert_disk_files_to_parquet( } } -fn parquet_writer_props() -> WriterPropertiesBuilder { - WriterProperties::builder() - .set_max_row_group_size(CONFIG.parseable.row_group_size) - .set_compression(CONFIG.parseable.parquet_compression.into()) - .set_column_encoding( - ColumnPath::new(vec![DEFAULT_TIMESTAMP_KEY.to_string()]), - Encoding::DELTA_BINARY_PACKED, - ) - .set_sorting_columns(Some(vec![SortingColumn { - column_idx: 0, - descending: true, - nulls_first: true, - }])) +fn parquet_writer_props( + time_partition: Option, + index_time_partition: usize, +) -> WriterPropertiesBuilder { + let index_time_partition: i32 = index_time_partition as i32; + + if let Some(time_partition) = time_partition { + WriterProperties::builder() + .set_max_row_group_size(CONFIG.parseable.row_group_size) + .set_compression(CONFIG.parseable.parquet_compression.into()) + .set_column_encoding( + 
ColumnPath::new(vec![time_partition]), + Encoding::DELTA_BYTE_ARRAY, + ) + .set_sorting_columns(Some(vec![SortingColumn { + column_idx: index_time_partition, + descending: true, + nulls_first: true, + }])) + } else { + WriterProperties::builder() + .set_max_row_group_size(CONFIG.parseable.row_group_size) + .set_compression(CONFIG.parseable.parquet_compression.into()) + .set_column_encoding( + ColumnPath::new(vec![DEFAULT_TIMESTAMP_KEY.to_string()]), + Encoding::DELTA_BINARY_PACKED, + ) + .set_sorting_columns(Some(vec![SortingColumn { + column_idx: index_time_partition, + descending: true, + nulls_first: true, + }])) + } +} + +pub fn get_ingestor_info() -> anyhow::Result { + let path = PathBuf::from(&CONFIG.parseable.local_staging_path); + + // all the files should be in the staging directory root + let entries = std::fs::read_dir(path)?; + let url = get_url(); + let port = url.port().expect("here port should be defined").to_string(); + let url = url.to_string(); + + for entry in entries { + // cause the staging directory will have only one file with ingestor in the name + // so the JSON Parse should not error unless the file is corrupted + let path = entry?.path(); + let flag = path + .file_name() + .unwrap_or_default() + .to_str() + .unwrap_or_default() + .contains("ingestor"); + + if flag { + // get the ingestor metadata from staging + let mut meta: IngestorMetadata = serde_json::from_slice(&std::fs::read(path)?)?; + + // compare url endpoint and port + if meta.domain_name != url { + log::info!( + "Domain Name was Updated. Old: {} New: {}", + meta.domain_name, + url + ); + meta.domain_name = url; + } + + if meta.port != port { + log::info!("Port was Updated. Old: {} New: {}", meta.port, port); + meta.port = port; + } + + let token = base64::prelude::BASE64_STANDARD.encode(format!( + "{}:{}", + CONFIG.parseable.username, CONFIG.parseable.password + )); + + let token = format!("Basic {}", token); + + if meta.token != token { + // TODO: Update the message to be more informative with username and password + log::info!( + "Credentials were Updated. Old: {} New: {}", + meta.token, + token + ); + meta.token = token; + } + + put_ingestor_info(meta.clone())?; + return Ok(meta); + } + } + + let store = CONFIG.storage().get_object_store(); + let out = IngestorMetadata::new( + port, + url, + DEFAULT_VERSION.to_string(), + store.get_bucket_name(), + &CONFIG.parseable.username, + &CONFIG.parseable.password, + get_ingestor_id(), + ); + + put_ingestor_info(out.clone())?; + Ok(out) +} + +/// Puts the ingestor info into the staging. +/// +/// This function takes the ingestor info as a parameter and stores it in staging. +/// # Parameters +/// +/// * `ingestor_info`: The ingestor info to be stored. 
+fn put_ingestor_info(info: IngestorMetadata) -> anyhow::Result<()> {
+    let path = PathBuf::from(&CONFIG.parseable.local_staging_path);
+    let file_name = format!("ingestor.{}.json", info.ingestor_id);
+    let file_path = path.join(file_name);
+
+    std::fs::write(file_path, serde_json::to_string(&info)?)?;
+
+    Ok(())
 }
 
 #[derive(Debug, thiserror::Error)]
diff --git a/server/src/storage/store_metadata.rs b/server/src/storage/store_metadata.rs
index 0e9ad955f..44ae55868 100644
--- a/server/src/storage/store_metadata.rs
+++ b/server/src/storage/store_metadata.rs
@@ -26,13 +26,14 @@ use once_cell::sync::OnceCell;
 use std::io;
 
 use crate::{
-    option::CONFIG,
+    metadata::error::stream_info::MetadataError,
+    option::{Mode, CONFIG, JOIN_COMMUNITY},
     rbac::{role::model::DefaultPrivilege, user::User},
     storage::ObjectStorageError,
     utils::uid,
 };
 
-use super::object_storage::PARSEABLE_METADATA_FILE_NAME;
+use super::{object_storage::parseable_json_path, PARSEABLE_METADATA_FILE_NAME};
 
 // Expose some static variables for internal usage
 pub static STORAGE_METADATA: OnceCell<StorageMetadata> = OnceCell::new();
@@ -55,6 +56,7 @@ pub struct StorageMetadata {
     pub deployment_id: uid::Uid,
     pub users: Vec<User>,
     pub streams: Vec<String>,
+    pub server_mode: String,
     #[serde(default)]
     pub roles: HashMap<String, Vec<DefaultPrivilege>>,
     #[serde(default)]
@@ -69,6 +71,7 @@ impl StorageMetadata {
             staging: CONFIG.staging_dir().to_path_buf(),
             storage: CONFIG.storage().get_endpoint(),
             deployment_id: uid::gen(),
+            server_mode: CONFIG.parseable.mode.to_string(),
             users: Vec::new(),
             streams: Vec::new(),
             roles: HashMap::default(),
@@ -92,25 +95,16 @@ impl StorageMetadata {
     }
 }
 
-// always returns remote metadata as it is source of truth
-// overwrites staging metadata while updating storage info
+/// deals with the staging directory creation and metadata resolution
+/// always returns remote metadata as it is the source of truth
+/// overwrites staging metadata while updating storage info
 pub async fn resolve_parseable_metadata() -> Result<StorageMetadata, ObjectStorageError> {
     let staging_metadata = get_staging_metadata()?;
     let storage = CONFIG.storage().get_object_store();
     let remote_metadata = storage.get_metadata().await?;
 
-    let check = match (staging_metadata, remote_metadata) {
-        (Some(staging), Some(remote)) => {
-            if staging.deployment_id == remote.deployment_id {
-                EnvChange::None(remote)
-            } else {
-                EnvChange::NewRemote
-            }
-        }
-        (None, Some(remote)) => EnvChange::NewStaging(remote),
-        (Some(_), None) => EnvChange::NewRemote,
-        (None, None) => EnvChange::CreateBoth,
-    };
+    // work out how the environment has changed from the staging and remote metadata
+    let check = determine_environment(staging_metadata, remote_metadata);
 
     // flags for if metadata needs to be synced
     let mut overwrite_staging = false;
@@ -120,36 +114,69 @@ pub async fn resolve_parseable_metadata() -> Result<StorageMetadata, ObjectStorageError> {
             // overwrite staging anyways so that it matches remote in case of any divergence
             overwrite_staging = true;
+            if CONFIG.parseable.mode == Mode::All {
+                standalone_after_distributed(Mode::from_string(&metadata.server_mode).expect("mode should be valid here"))
+                    .map_err(|err| {
+                        ObjectStorageError::Custom(err.to_string())
+                    })?;
+            }
             Ok(metadata)
         },
         EnvChange::NewRemote => {
             Err("Could not start the server because staging directory indicates stale data from previous deployment, please choose an empty staging directory and restart the server")
         }
         EnvChange::NewStaging(mut metadata) => {
-            create_dir_all(CONFIG.staging_dir())?;
-            metadata.staging = CONFIG.staging_dir().canonicalize()?;
-            // this flag is set to true so that metadata is copied to staging
-            overwrite_staging = true;
-            // overwrite remote because staging dir has changed.
-            overwrite_remote = true;
-            Ok(metadata)
+            // if the server is started in ingest mode, we need to make sure that the query server has been started
+            // i.e. the metadata is updated to reflect the server mode = Query
+            if Mode::from_string(&metadata.server_mode).map_err(ObjectStorageError::Custom)? == Mode::All && CONFIG.parseable.mode == Mode::Ingest {
+                Err("Starting Ingest Mode is not allowed since the Query Server has not been started yet")
+            } else {
+                create_dir_all(CONFIG.staging_dir())?;
+                metadata.staging = CONFIG.staging_dir().canonicalize()?;
+                // this flag is set to true so that metadata is copied to staging
+                overwrite_staging = true;
+                // overwrite remote in all and query mode
+                // because staging dir has changed.
+                match CONFIG.parseable.mode {
+                    Mode::All => {
+                        standalone_after_distributed(Mode::from_string(&metadata.server_mode).expect("mode should be valid here"))
+                            .map_err(|err| {
+                                ObjectStorageError::Custom(err.to_string())
+                            })?;
+                        overwrite_remote = true;
+                    },
+                    Mode::Query => {
+                        overwrite_remote = true;
+                        metadata.server_mode = CONFIG.parseable.mode.to_string();
+                        metadata.staging = CONFIG.staging_dir().to_path_buf();
+                    },
+                    Mode::Ingest => {
+                        // if the ingest server is started, fetch the metadata from remote
+                        // and update the server mode in the local metadata
+                        metadata.server_mode = CONFIG.parseable.mode.to_string();
+                        metadata.staging = CONFIG.staging_dir().to_path_buf();
+                    },
+                }
+                Ok(metadata)
+            }
         }
         EnvChange::CreateBoth => {
             create_dir_all(CONFIG.staging_dir())?;
             let metadata = StorageMetadata::new();
-            // new metadata needs to be set on both staging and remote
-            overwrite_remote = true;
+            // new metadata needs to be set
+            // if the mode is query or all, set both staging and remote
+            match CONFIG.parseable.mode {
+                Mode::All | Mode::Query => overwrite_remote = true,
+                _ => (),
+            }
+            // else only staging
             overwrite_staging = true;
             Ok(metadata)
         }
     };
 
     let metadata = res.map_err(|err| {
-        let err = format!(
-            "{}. {}",
-            err,
-            "Join us on Parseable Slack to report this incident : https://launchpass.com/parseable"
-        );
+        let err = format!("{}. {}", err, JOIN_COMMUNITY);
         let err: Box<dyn std::error::Error + Send + Sync> = err.into();
         ObjectStorageError::UnhandledError(err)
     })?;
@@ -165,10 +192,37 @@ pub async fn resolve_parseable_metadata() -> Result<StorageMetadata, ObjectStorageError> {
+fn determine_environment(
+    staging_metadata: Option<StorageMetadata>,
+    remote_metadata: Option<StorageMetadata>,
+) -> EnvChange {
+    match (staging_metadata, remote_metadata) {
+        (Some(staging), Some(remote)) => {
+            // if both staging and remote have the same deployment id
+            if staging.deployment_id == remote.deployment_id {
+                EnvChange::None(remote)
+            } else if Mode::from_string(&remote.server_mode).expect("server mode is valid here")
+                == Mode::All
+                && (CONFIG.parseable.mode == Mode::Query || CONFIG.parseable.mode == Mode::Ingest)
+            {
+                // if you are switching to distributed mode from standalone mode
+                // it will create a new staging rather than a new remote
+                EnvChange::NewStaging(remote)
+            } else {
+                // it is a new remote
+                EnvChange::NewRemote
+            }
+        }
+        (None, Some(remote)) => EnvChange::NewStaging(remote),
+        (Some(_), None) => EnvChange::NewRemote,
+        (None, None) => EnvChange::CreateBoth,
+    }
+}
+
 // variant contain remote metadata
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum EnvChange {
-    /// No change in env i.e both staging and remote have same id 
+    /// No change in env i.e both staging and remote have same id
     /// or deployment id of staging is not matching with that of remote
     None(StorageMetadata),
     /// Metadata not found in storage. Treated as possible misconfiguration on user side.
@@ -179,8 +233,18 @@ pub enum EnvChange {
     CreateBoth,
 }
 
+fn standalone_after_distributed(remote_server_mode: Mode) -> Result<(), MetadataError> {
+    // standalone -> query | ingest allowed
+    // but query | ingest -> standalone not allowed
+    if remote_server_mode == Mode::Query {
+        return Err(MetadataError::StandaloneWithDistributed("Starting Standalone Mode is not permitted when Distributed Mode is enabled. Please restart the server in Distributed Mode.".to_string()));
+    }
+
+    Ok(())
+}
+
 pub fn get_staging_metadata() -> io::Result<Option<StorageMetadata>> {
-    let path = CONFIG.staging_dir().join(PARSEABLE_METADATA_FILE_NAME);
+    let path = parseable_json_path().to_path(CONFIG.staging_dir());
     let bytes = match fs::read(path) {
         Ok(bytes) => bytes,
         Err(err) => match err.kind() {
diff --git a/server/src/sync.rs b/server/src/sync.rs
new file mode 100644
index 000000000..b44dcde13
--- /dev/null
+++ b/server/src/sync.rs
@@ -0,0 +1,112 @@
+/*
+ * Parseable Server (C) 2022 - 2024 Parseable, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ *
+ */
+
+use clokwerk::{AsyncScheduler, Job, Scheduler, TimeUnits};
+use thread_priority::{ThreadBuilder, ThreadPriority};
+use tokio::sync::oneshot;
+use tokio::sync::oneshot::error::TryRecvError;
+
+use std::panic::{catch_unwind, AssertUnwindSafe};
+use std::thread::{self, JoinHandle};
+use std::time::Duration;
+
+use crate::option::CONFIG;
+use crate::{storage, STORAGE_UPLOAD_INTERVAL};
+
+pub fn object_store_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) {
+    let (outbox_tx, outbox_rx) = oneshot::channel::<()>();
+    let (inbox_tx, inbox_rx) = oneshot::channel::<()>();
+    let mut inbox_rx = AssertUnwindSafe(inbox_rx);
+    let handle = thread::spawn(move || {
+        let res = catch_unwind(move || {
+            let rt = actix_web::rt::System::new();
+            rt.block_on(async {
+                let mut scheduler = AsyncScheduler::new();
+                scheduler
+                    .every(STORAGE_UPLOAD_INTERVAL.seconds())
+                    // Extra time interval is added so that this scheduler does not race with local sync.
+                    .plus(5u32.seconds())
+                    .run(|| async {
+                        if let Err(e) = CONFIG.storage().get_object_store().sync().await {
+                            log::warn!("failed to sync local data with object store. {:?}", e);
+                        }
+                    });
+
+                loop {
+                    tokio::time::sleep(Duration::from_secs(1)).await;
+                    scheduler.run_pending().await;
+                    match AssertUnwindSafe(|| inbox_rx.try_recv())() {
+                        Ok(_) => break,
+                        Err(TryRecvError::Empty) => continue,
+                        Err(TryRecvError::Closed) => {
+                            // should be unreachable but breaking anyway
+                            break;
+                        }
+                    }
+                }
+            })
+        });
+
+        if res.is_err() {
+            outbox_tx.send(()).unwrap();
+        }
+    });
+
+    (handle, outbox_rx, inbox_tx)
+}
+
+pub fn run_local_sync() -> (JoinHandle<()>, oneshot::Receiver<()>, oneshot::Sender<()>) {
+    let (outbox_tx, outbox_rx) = oneshot::channel::<()>();
+    let (inbox_tx, inbox_rx) = oneshot::channel::<()>();
+    let mut inbox_rx = AssertUnwindSafe(inbox_rx);
+
+    let handle = ThreadBuilder::default()
+        .name("local-sync")
+        .priority(ThreadPriority::Max)
+        .spawn(move |priority_result| {
+            if priority_result.is_err() {
+                log::warn!("Max priority cannot be set for sync thread. Make sure that user/program is allowed to set thread priority.")
+            }
+            let res = catch_unwind(move || {
+                let mut scheduler = Scheduler::new();
+                scheduler
+                    .every((storage::LOCAL_SYNC_INTERVAL as u32).seconds())
+                    .run(move || crate::event::STREAM_WRITERS.unset_all());
+
+                loop {
+                    thread::sleep(Duration::from_millis(50));
+                    scheduler.run_pending();
+                    match AssertUnwindSafe(|| inbox_rx.try_recv())() {
+                        Ok(_) => break,
+                        Err(TryRecvError::Empty) => continue,
+                        Err(TryRecvError::Closed) => {
+                            // should be unreachable but breaking anyway
+                            break;
+                        }
+                    }
+                }
+            });
+
+            if res.is_err() {
+                outbox_tx.send(()).unwrap();
+            }
+        })
+        .unwrap();
+
+    (handle, outbox_rx, inbox_tx)
+}
diff --git a/server/src/utils.rs b/server/src/utils.rs
index 58f0c3eee..ec60f115a 100644
--- a/server/src/utils.rs
+++ b/server/src/utils.rs
@@ -22,10 +22,12 @@ pub mod header_parsing;
 pub mod json;
 pub mod uid;
 pub mod update;
-
-use std::path::Path;
-
+use crate::option::CONFIG;
 use chrono::{DateTime, NaiveDate, Timelike, Utc};
+use sha2::{Digest, Sha256};
+
+use std::env;
+use url::Url;
 
 #[allow(dead_code)]
 pub fn hostname() -> Option<String> {
@@ -38,22 +40,6 @@ pub fn hostname_unchecked() -> String {
     hostname::get().unwrap().into_string().unwrap()
 }
 
-#[allow(dead_code)]
-pub fn capitalize_ascii(s: &str) -> String {
-    s[0..1].to_uppercase() + &s[1..]
-}
-
-pub fn validate_path_is_writeable(path: &Path) -> anyhow::Result<()> {
-    let Ok(md) = std::fs::metadata(path) else {
-        anyhow::bail!("Could not read metadata for staging dir")
-    };
-    let permissions = md.permissions();
-    if permissions.readonly() {
-        anyhow::bail!("Staging directory {} is not writable", path.display())
-    }
-    Ok(())
-}
-
 /// Convert minutes to a slot range
 /// e.g. given minute = 15 and OBJECT_STORE_DATA_GRANULARITY = 10 returns "10-19"
 pub fn minute_to_slot(minute: u32, data_granularity: u32) -> Option<String> {
@@ -235,6 +221,59 @@ impl TimePeriod {
     }
 }
 
+pub fn get_url() -> Url {
+    if CONFIG.parseable.ingestor_endpoint.is_empty() {
+        return format!(
+            "{}://{}",
+            CONFIG.parseable.get_scheme(),
+            CONFIG.parseable.address
+        )
+        .parse::<Url>() // if the value was improperly set, this will panic beforehand
+        .expect("Valid URL");
+    }
+    let addr_from_env = CONFIG
+        .parseable
+        .ingestor_endpoint
+        .split(':')
+        .collect::<Vec<&str>>();
+
+    let mut hostname = addr_from_env[0].to_string();
+    let mut port = addr_from_env[1].to_string();
+
+    // if the env var value fits the pattern $VAR_NAME:$VAR_NAME
+    // fetch the value from the specified env vars
+    if hostname.starts_with('$') {
+        let var_hostname = hostname[1..].to_string();
+        hostname = get_from_env(&var_hostname);
+    }
+    if !hostname.starts_with("http") {
+        hostname = format!("{}://{}", CONFIG.parseable.get_scheme(), hostname);
+    }
+
+    if port.starts_with('$') {
+        let var_port = port[1..].to_string();
+        port = get_from_env(&var_port);
+    }
+    format!("{}:{}", hostname, port)
+        .parse::<Url>()
+        .expect("Valid URL")
+}
+
+/// util function to fetch value from an env var
+fn get_from_env(var_to_fetch: &str) -> String {
+    env::var(var_to_fetch).unwrap_or_else(|_| "".to_string())
+}
+
+pub fn get_ingestor_id() -> String {
+    let now = Utc::now().to_rfc3339().to_string();
+    let mut hasher = Sha256::new();
+    hasher.update(now);
+    let result = format!("{:x}", hasher.finalize());
+    let result = result.split_at(15).0.to_string();
+    log::debug!("Ingestor ID: {}", &result);
+    result.to_string()
+}
+
 #[cfg(test)]
 mod tests {
     use chrono::DateTime;
@@ -263,7 +302,7 @@ mod tests {
         ]
     )]
     #[case::same_hour_with_00_to_59_minute_block(
-        "2022-06-11T16:00:00+00:00", "2022-06-11T16:59:59+00:00", 
+        "2022-06-11T16:00:00+00:00", "2022-06-11T16:59:59+00:00",
         &["date=2022-06-11/hour=16/"]
     )]
     #[case::same_date_different_hours_coherent_minute(
@@ -274,14 +313,14 @@ mod tests {
         ]
     )]
     #[case::same_date_different_hours_incoherent_minutes(
-        "2022-06-11T15:59:00+00:00", "2022-06-11T16:01:00+00:00", 
+        "2022-06-11T15:59:00+00:00", "2022-06-11T16:01:00+00:00",
         &[
             "date=2022-06-11/hour=15/minute=59/",
             "date=2022-06-11/hour=16/minute=00/"
         ]
     )]
     #[case::same_date_different_hours_whole_hours_between_incoherent_minutes(
-        "2022-06-11T15:59:00+00:00", "2022-06-11T17:01:00+00:00", 
+        "2022-06-11T15:59:00+00:00", "2022-06-11T17:01:00+00:00",
         &[
             "date=2022-06-11/hour=15/minute=59/",
             "date=2022-06-11/hour=16/",
@@ -289,14 +328,14 @@ mod tests {
         ]
     )]
     #[case::different_date_coherent_hours_and_minutes(
-        "2022-06-11T00:00:00+00:00", "2022-06-13T00:00:00+00:00", 
+        "2022-06-11T00:00:00+00:00", "2022-06-13T00:00:00+00:00",
         &[
             "date=2022-06-11/",
             "date=2022-06-12/"
         ]
     )]
     #[case::different_date_incoherent_hours_coherent_minutes(
-        "2022-06-11T23:00:01+00:00", "2022-06-12T01:59:59+00:00", 
+        "2022-06-11T23:00:01+00:00", "2022-06-12T01:59:59+00:00",
         &[
             "date=2022-06-11/hour=23/",
             "date=2022-06-12/hour=00/",
@@ -304,7 +343,7 @@ mod tests {
         ]
     )]
     #[case::different_date_incoherent_hours_incoherent_minutes(
-        "2022-06-11T23:59:59+00:00", "2022-06-12T00:01:00+00:00", 
+        "2022-06-11T23:59:59+00:00", "2022-06-12T00:01:00+00:00",
         &[
             "date=2022-06-11/hour=23/minute=59/",
             "date=2022-06-12/hour=00/minute=00/"
diff --git a/server/src/utils/arrow/merged_reader.rs b/server/src/utils/arrow/merged_reader.rs
index 8a31ae200..ef76ddf3f 100644
--- a/server/src/utils/arrow/merged_reader.rs
+++ b/server/src/utils/arrow/merged_reader.rs
@@ -17,12 +17,11 @@
  *
  */
 
-use std::{fs::File, io::BufReader, path::PathBuf, sync::Arc};
-
 use arrow_array::{RecordBatch, TimestampMillisecondArray};
 use arrow_ipc::reader::StreamReader;
 use arrow_schema::Schema;
 use itertools::kmerge_by;
+use std::{fs::File, io::BufReader, path::PathBuf, sync::Arc};
 
 use super::{
     adapt_batch,
diff --git a/server/src/utils/json.rs b/server/src/utils/json.rs
index 0f18d4bf7..082b4e823 100644
--- a/server/src/utils/json.rs
+++ b/server/src/utils/json.rs
@@ -21,8 +21,11 @@ use serde_json::Value;
 
 pub mod flatten;
 
-pub fn flatten_json_body(body: serde_json::Value) -> Result<Value, anyhow::Error> {
-    flatten::flatten(body, "_")
+pub fn flatten_json_body(
+    body: serde_json::Value,
+    time_partition: Option<String>,
+) -> Result<Value, anyhow::Error> {
+    flatten::flatten(body, "_", time_partition)
 }
 
 pub fn convert_to_string(value: &Value) -> Value {
diff --git a/server/src/utils/json/flatten.rs b/server/src/utils/json/flatten.rs
index 4b7a21556..bc7a33a21 100644
--- a/server/src/utils/json/flatten.rs
+++ b/server/src/utils/json/flatten.rs
@@ -17,26 +17,45 @@
  */
 
 use anyhow::anyhow;
+use chrono::{DateTime, Timelike, Utc};
 use itertools::Itertools;
 use serde_json::map::Map;
 use serde_json::value::Value;
 
-pub fn flatten(nested_value: Value, separator: &str) -> Result<Value, anyhow::Error> {
+pub fn flatten(
+    nested_value: Value,
+    separator: &str,
+    time_partition: Option<String>,
+) -> Result<Value, anyhow::Error> {
     match nested_value {
         Value::Object(nested_dict) => {
-            let mut map = Map::new();
-            flatten_object(&mut map, None, nested_dict, separator)?;
-            Ok(Value::Object(map))
+            let validate_time_partition_result =
+                validate_time_partition(Value::Object(nested_dict.clone()), time_partition.clone());
+            if validate_time_partition_result.is_ok() {
+                let mut map = Map::new();
+                flatten_object(&mut map, None, nested_dict, separator)?;
+                Ok(Value::Object(map))
+            } else {
+                Err(anyhow!(validate_time_partition_result.unwrap_err()))
+            }
         }
         Value::Array(mut arr) => {
             for _value in &mut arr {
-                let value = std::mem::replace(_value, Value::Null);
-                let mut map = Map::new();
-                let Value::Object(obj) = value else {
-                    return Err(anyhow!("Expected object in array of objects"));
-                };
-                flatten_object(&mut map, None, obj, separator)?;
-                *_value = Value::Object(map);
+                let value: Value = _value.clone();
+                let validate_time_partition_result =
+                    validate_time_partition(value, time_partition.clone());
+
+                if validate_time_partition_result.is_ok() {
+                    let value = std::mem::replace(_value, Value::Null);
+                    let mut map = Map::new();
+                    let Value::Object(obj) = value else {
+                        return Err(anyhow!("Expected object in array of objects"));
+                    };
+                    flatten_object(&mut map, None, obj, separator)?;
+                    *_value = Value::Object(map);
+                } else {
+                    return Err(anyhow!(validate_time_partition_result.unwrap_err()));
+                }
             }
             Ok(Value::Array(arr))
         }
@@ -44,6 +63,58 @@ pub fn flatten(nested_value: Value, separator: &str) -> Result<Value, anyhow::Error> {
     }
 }
 
+fn validate_time_partition(
+    value: Value,
+    time_partition: Option<String>,
+) -> Result<bool, anyhow::Error> {
+    if time_partition.is_none() {
+        Ok(true)
+    } else {
+        let body_timestamp = value.get(&time_partition.clone().unwrap().to_string());
+        if body_timestamp.is_some() {
+            if body_timestamp
+                .unwrap()
+                .to_owned()
+                .as_str()
+                .unwrap()
+                .parse::<DateTime<Utc>>()
+                .is_ok()
+            {
+                let parsed_timestamp = body_timestamp
+                    .unwrap()
+                    .to_owned()
+                    .as_str()
+                    .unwrap()
+                    .parse::<DateTime<Utc>>()
+                    .unwrap()
+                    .naive_utc();
+
+                if parsed_timestamp.date() == Utc::now().naive_utc().date()
+                    && parsed_timestamp.hour() == Utc::now().naive_utc().hour()
+                    && parsed_timestamp.minute() == Utc::now().naive_utc().minute()
+                {
+                    Ok(true)
+                } else {
+                    Err(anyhow!(format!(
+                        "field {} and server time are not the same",
+                        time_partition.unwrap()
+                    )))
+                }
+            } else {
+                Err(anyhow!(format!(
+                    "field {} is not in the correct datetime format",
+                    time_partition.unwrap()
+                )))
+            }
+        } else {
+            Err(anyhow!(format!(
+                "ingestion failed as field {} is not part of the log",
+                time_partition.unwrap()
+            )))
+        }
+    }
+}
+
 pub fn flatten_with_parent_prefix(
     nested_value: Value,
     prefix: &str,
@@ -158,19 +229,19 @@ mod tests {
     #[test]
     fn flatten_single_key_string() {
         let obj = json!({"key": "value"});
-        assert_eq!(obj.clone(), flatten(obj, "_").unwrap());
+        assert_eq!(obj.clone(), flatten(obj, "_", None).unwrap());
     }
 
     #[test]
     fn flatten_single_key_int() {
         let obj = json!({"key": 1});
-        assert_eq!(obj.clone(), flatten(obj, "_").unwrap());
+        assert_eq!(obj.clone(), flatten(obj, "_", None).unwrap());
     }
 
     #[test]
     fn flatten_multiple_key_value() {
         let obj = json!({"key1": 1, "key2": "value2"});
-        assert_eq!(obj.clone(), flatten(obj, "_").unwrap());
+        assert_eq!(obj.clone(), flatten(obj, "_", None).unwrap());
     }
 
     #[test]
@@ -178,7 +249,7 @@ mod tests {
         let obj = json!({"key": "value", "nested_key": {"key":"value"}});
         assert_eq!(
             json!({"key": "value", "nested_key.key": "value"}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
@@ -187,7 +258,7 @@ mod tests {
         let obj = json!({"key": "value", "nested_key": {"key1":"value1", "key2": "value2"}});
         assert_eq!(
             json!({"key": "value", "nested_key.key1": "value1", "nested_key.key2": "value2"}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
        );
     }
 
@@ -196,7 +267,7 @@ mod tests {
         let obj = json!({"key": "value", "nested_key": {"key1":[1,2,3]}});
         assert_eq!(
             json!({"key": "value", "nested_key.key1": [1,2,3]}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
@@ -205,7 +276,7 @@ mod tests {
         let obj = json!({"key": [{"a": "value0"}, {"a": "value1"}]});
         assert_eq!(
             json!({"key.a": ["value0", "value1"]}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
@@ -214,7 +285,7 @@ mod tests {
         let obj = json!({"key": [{"a": "value0"}, {"a": "value1", "b": "value1"}]});
         assert_eq!(
             json!({"key.a": ["value0", "value1"], "key.b": [null, "value1"]}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
@@ -223,7 +294,7 @@ mod tests {
         let obj = json!({"key": [{"a": "value0", "b": "value0"}, {"a": "value1"}]});
         assert_eq!(
             json!({"key.a": ["value0", "value1"], "key.b": ["value0", null]}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
@@ -232,7 +303,7 @@ mod tests {
         let obj = json!({"key": [{"a": {"p": 0}, "b": "value0"}, {"b": "value1"}]});
         assert_eq!(
             json!({"key.a.p": [0, null], "key.b": ["value0", "value1"]}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
@@ -241,14 +312,14 @@ mod tests {
         let obj = json!({"key": [{"a": [{"p": "value0", "q": "value0"}, {"p": "value1", "q": null}], "b": "value0"}, {"b": "value1"}]});
         assert_eq!(
             json!({"key.a.p": [["value0", "value1"], null], "key.a.q": [["value0", null], null], "key.b": ["value0", "value1"]}),
-            flatten(obj, ".").unwrap()
+            flatten(obj, ".", None).unwrap()
         );
     }
 
     #[test]
     fn flatten_mixed_object() {
         let obj = json!({"a": 42, "arr": ["1", {"key": "2"}, {"key": {"nested": "3"}}]});
-        assert!(flatten(obj, ".").is_err());
+        assert!(flatten(obj, ".", None).is_err());
     }
 
     #[test]