diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..ddce69b6 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,3 @@ +node_modules/ +dist/ +.astro/ diff --git a/docs/astro.config.mjs b/docs/astro.config.mjs new file mode 100644 index 00000000..5ffe816e --- /dev/null +++ b/docs/astro.config.mjs @@ -0,0 +1,63 @@ +// @ts-check +import { defineConfig } from "astro/config"; +import starlight from "@astrojs/starlight"; + +// https://astro.build/config +export default defineConfig({ + integrations: [ + starlight({ + title: "ODict", + description: + "The lightning-fast open-source dictionary file format for human languages", + social: [ + { + icon: "github", + label: "GitHub", + href: "https://github.com/TheOpenDictionary/odict", + }, + ], + editLink: { + baseUrl: + "https://github.com/TheOpenDictionary/odict/edit/main/docs/", + }, + sidebar: [ + { + label: "Getting Started", + items: [ + { label: "Introduction", slug: "getting-started/introduction" }, + { label: "Installation", slug: "getting-started/installation" }, + { label: "Quick Start", slug: "getting-started/quickstart" }, + ], + }, + { + label: "XML Schema", + items: [ + { label: "Overview", slug: "schema/overview" }, + { label: "Reference", slug: "schema/reference" }, + ], + }, + { + label: "Guides", + items: [ + { label: "Compiling Dictionaries", slug: "guides/compiling" }, + { label: "Looking Up Entries", slug: "guides/lookup" }, + { label: "Searching Dictionaries", slug: "guides/search" }, + { label: "Tokenizing Text", slug: "guides/tokenize" }, + ], + }, + { + label: "CLI", + items: [{ label: "Command Reference", slug: "cli/reference" }], + }, + { + label: "API Reference", + items: [ + { label: "Rust", slug: "api/rust" }, + { label: "Python", slug: "api/python" }, + { label: "JavaScript", slug: "api/javascript" }, + ], + }, + ], + }), + ], +}); diff --git a/docs/package.json b/docs/package.json new file mode 100644 index 00000000..e4eba79e --- /dev/null +++ b/docs/package.json @@ 
-0,0 +1,19 @@ +{ + "name": "@odict/docs", + "type": "module", + "version": "0.0.1", + "private": true, + "scripts": { + "dev": "npm run generate && astro dev", + "start": "npm run generate && astro dev", + "build": "npm run generate && astro build", + "preview": "astro preview", + "astro": "astro", + "generate": "node scripts/generate-schema-docs.mjs && node scripts/generate-cli-docs.mjs" + }, + "dependencies": { + "@astrojs/starlight": "^0.32.0", + "astro": "^5.3.0", + "sharp": "^0.33.0" + } +} diff --git a/docs/python-api-generated.md b/docs/python-api-generated.md new file mode 100644 index 00000000..93d0c99c --- /dev/null +++ b/docs/python-api-generated.md @@ -0,0 +1,450 @@ +# Python API + +*Auto-generated from Rust doc comments.* + +--- + +## Functions + +### `compile()` + +Compiles an ODXML string into binary `.odict` data. + +Takes an XML string conforming to the ODict XML schema and returns +the compiled binary representation as a byte vector. The resulting +bytes can be passed to [`OpenDictionary::new`] or saved to disk. + +# Errors + +Returns an error if the XML is malformed or does not conform to the +ODict schema. + +## `OpenDictionary` + +The main class for working with compiled ODict dictionaries. + +An `OpenDictionary` wraps a compiled binary dictionary and provides +methods for looking up terms, full-text search, tokenization, and more. + +# Construction + +Create from compiled bytes or an XML string using [`OpenDictionary::new`], +or load from a file path or remote registry using [`OpenDictionary::load`]. + +### Methods + +#### `load()` + +Loads a dictionary from a file path, alias, or remote identifier. + +This is an async method. If `dictionary` is a path to a `.odict` file, +it loads from disk. If it matches the format `org/lang` (e.g. `wiktionary/eng`), +it downloads from the remote registry. + +#### `new()` + +Creates a dictionary from compiled binary data or directly from an XML string. 
+ +Accepts either `bytes` (as returned by [`compile`]) or a `str` containing +ODXML markup. + +#### `save()` + +Saves the dictionary to disk as a `.odict` file. + +Optionally configure Brotli compression via `quality` (0–11) and +`window_size` (0–22). + +#### `min_rank()` + +The minimum rank value across all entries, or `None` if no entries have ranks. + +#### `max_rank()` + +The maximum rank value across all entries, or `None` if no entries have ranks. + +#### `lookup()` + +Looks up one or more terms by exact match. + +- `query` — a single term or list of terms to look up. +- `split` — minimum word length for compound splitting. +- `follow` — follow `see_also` cross-references until an entry with etymologies is found. +- `insensitive` — enable case-insensitive matching. + +#### `lexicon()` + +Returns all terms defined in the dictionary, sorted alphabetically. + +#### `index()` + +Creates a full-text search index for the dictionary. + +Must be called before [`OpenDictionary::search`]. + +#### `search()` + +Runs a full-text search across the dictionary. + +Requires an index — call [`OpenDictionary::index`] first. + +#### `tokenize()` + +Tokenizes text using NLP-based segmentation and matches each token against the dictionary. + +Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, +and Latin-script languages. + +- `text` — the text to tokenize. +- `follow` — follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). +- `insensitive` — enable case-insensitive matching. + +--- + +## `Definition` + +A single definition of a word sense. + +Contains the definition text along with optional examples and notes. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this definition. | +| `value` | `str` | The definition text. | +| `examples` | `list[Example]` | Usage examples illustrating this definition. 
| +| `notes` | `list[Note]` | Additional notes about this definition. | + +--- + +## `Entry` + +A dictionary entry representing a single headword and its associated data. + +Each entry contains the term itself, optional ranking metadata, +cross-reference information, etymologies, and media attachments. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `term` | `str` | The headword for this entry. | +| `rank` | `int | None` | Optional frequency rank for ordering entries. | +| `see_also` | `str | None` | Cross-reference target term, if this entry redirects to another. | +| `etymologies` | `list[Etymology]` | The etymologies associated with this entry. | +| `media` | `list[MediaURL]` | Media URLs (audio, images, etc.) associated with this entry. | + +--- + +## `EnumWrapper` + +A wrapper for ODict enumeration values (e.g. part of speech, pronunciation kind). + +ODict enums are represented as string triples: the enum name, +the variant name, and the variant's string value. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `name` | `str` | The enum type name (e.g. `"PartOfSpeech"`). | +| `variant` | `str` | The variant name (e.g. `"Noun"`). | +| `value` | `str` | The string value of the variant (e.g. `"n"`). | + +--- + +## `Etymology` + +An etymology grouping for a dictionary entry. + +Etymologies group together senses that share a common word origin. +Each etymology can have its own pronunciations and description. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this etymology. | +| `pronunciations` | `list[Pronunciation]` | Pronunciations associated with this etymology. | +| `description` | `str | None` | Optional description of the word origin. | +| `senses` | `list[Sense]` | The senses (meanings) under this etymology. | + +--- + +## `Example` + +A usage example illustrating a definition. 
+ +Examples can optionally include translations and pronunciations. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `value` | `str` | The example text. | +| `translations` | `list[Translation]` | Translations of this example into other languages. | +| `pronunciations` | `list[Pronunciation]` | Pronunciations for this example. | + +--- + +## `Form` + +An inflected or alternate form of a word. + +Forms represent morphological variants such as plurals, conjugations, +or other inflections. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `term` | `str` | The inflected form text. | +| `kind` | `EnumWrapper | None` | The kind of form (e.g. plural, past tense), or `None`. | +| `tags` | `list[str]` | Tags for categorizing this form. | + +--- + +## `Group` + +A named group of related definitions. + +Groups allow organizing multiple definitions under a shared description, +such as grouping definitions by semantic domain. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this group. | +| `description` | `str` | A description of what this group of definitions has in common. | +| `definitions` | `list[Definition]` | The definitions within this group. | + +--- + +## `IndexOptions` + +Options for configuring full-text index creation. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `directory` | `str | None` | Custom directory for storing the index. | +| `memory` | `int | None` | Memory arena size per thread in bytes (must be >15 MB). | +| `overwrite` | `bool | None` | Whether to overwrite an existing index. | + +--- + +## `RemoteLoadOptions` + +Options for loading dictionaries from remote registries. 
+ +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `out_dir` | `str | None` | Custom output directory for downloaded files. | +| `caching` | `bool | None` | Whether to cache downloaded dictionaries locally. | +| `retries` | `int | None` | Number of download retries on failure. | + +--- + +## `LoadOptions` + +Options for loading a dictionary from a file path, alias, or remote registry. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `config_dir` | `str | None` | Custom configuration directory. | +| `remote` | `RemoteLoadOptions | None` | Options for remote dictionary loading. | + +--- + +## `LookupOptions` + +Options for configuring term lookups. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `split` | `int | None` | Minimum word length for compound splitting. | +| `follow` | `bool | None` | Whether to follow `see_also` cross-references. | +| `insensitive` | `bool | None` | Whether to enable case-insensitive matching. | + +--- + +## `LookupResult` + +The result of a dictionary lookup. + +Contains the matched entry and, if a `see_also` redirect was followed, +the original entry that initiated the redirect. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `entry` | `Entry` | The matched dictionary entry. | +| `directed_from` | `Entry | None` | The original entry if a `see_also` redirect was followed, or `None`. | + +--- + +## `MediaURL` + +A reference to an external media resource (audio, image, etc.). + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `src` | `str` | URL or path to the media file. | +| `mime_type` | `str | None` | MIME type (e.g. `audio/mpeg`), or `None`. | +| `description` | `str | None` | Human-readable description of the media. | + +--- + +## `Note` + +An additional note attached to a definition. 
+ +Notes provide supplementary information such as usage guidance, +historical context, or grammatical remarks. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `id` | `str | None` | Optional identifier for this note. | +| `value` | `str` | The note text. | +| `examples` | `list[Example]` | Examples associated with this note. | + +--- + +## `Pronunciation` + +A pronunciation entry for a word or etymology. + +Represents how a word is pronounced in a given notation system +(e.g. IPA, Pinyin), with optional audio media. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `kind` | `EnumWrapper | None` | The pronunciation system (e.g. IPA, Pinyin), or `None`. | +| `value` | `str` | The pronunciation notation string. | +| `media` | `list[MediaURL]` | Audio media URLs for this pronunciation. | + +--- + +## `CompressOptions` + +Brotli compression options for saving dictionaries. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `quality` | `int | None` | Compression quality level (0–11). | +| `window_size` | `int | None` | Compression window size (0–22). | + +--- + +## `SaveOptions` + +Options for saving a dictionary to disk. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `compress` | `CompressOptions | None` | Optional Brotli compression settings. | + +--- + +## `SearchOptions` + +Options for configuring full-text search. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `directory` | `str | None` | Custom directory for the search index. | +| `threshold` | `int | None` | Relevance score threshold for filtering results. | +| `autoindex` | `bool | None` | Whether to automatically create an index if one does not exist. | +| `limit` | `int | None` | Maximum number of results to return. 
| + +--- + +## `Sense` + +A word sense — a specific meaning grouped by part of speech. + +Senses represent distinct meanings of a word under a given etymology. +Each sense has a part of speech and contains definitions (or definition groups), +along with optional tags, translations, and inflected forms. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `pos` | `EnumWrapper` | The part of speech for this sense (e.g. noun, verb, adjective). | +| `lemma` | `str | None` | Optional lemma reference linking to another entry. | +| `definitions` | `list[Definition | Group]` | Definitions or definition groups under this sense. | +| `tags` | `list[str]` | Tags for categorizing or filtering this sense. | +| `translations` | `list[Translation]` | Translations of this sense into other languages. | +| `forms` | `list[Form]` | Inflected forms of the word under this sense. | + +--- + +## `Token` + +A token produced by NLP-based text segmentation. + +Each token represents a segment of the input text, with metadata about +its position, detected language and script, and any matching dictionary entries. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `lemma` | `str` | The original token text (lemma form). | +| `language` | `str | None` | Detected language code (e.g. `"eng"`), or `None` if unknown. | +| `entries` | `list[LookupResult]` | Matched dictionary entries for this token. | +| `kind` | `str` | The token kind (e.g. `"Word"`, `"Punctuation"`). | +| `script` | `str` | Detected script name (e.g. `"Latin"`, `"Han"`). | +| `start` | `int` | Start byte offset in the original text. | +| `end` | `int` | End byte offset in the original text. | + +--- + +## `TokenizeOptions` + +Options for configuring text tokenization. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `follow` | `bool | int | None` | Whether to follow `see_also` cross-references. 
Accepts `True`/`False` or a number (nonzero = follow). | +| `insensitive` | `bool | None` | Whether to enable case-insensitive matching. | + +--- + +## `Translation` + +A translation of a word, definition, or example into another language. + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `lang` | `str` | The BCP-47 language code (e.g. `"fra"`, `"deu"`). | +| `value` | `str` | The translated text. | + +--- diff --git a/docs/rustdoc-json-output.md b/docs/rustdoc-json-output.md new file mode 100644 index 00000000..f230d948 --- /dev/null +++ b/docs/rustdoc-json-output.md @@ -0,0 +1,275 @@ +# ODict Python API (from rustdoc JSON) + +*Generated from rustdoc JSON format v57* + +## `CompressOptions` + +Brotli compression options for saving dictionaries. + +| Field | Type | Description | +|-------|------|-------------| +| `quality` | `?` | Compression quality level (0–11). | +| `window_size` | `?` | Compression window size (0–22). | + +## `Definition` + +A single definition of a word sense. + +Contains the definition text along with optional examples and notes. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this definition. | +| `value` | `?` | The definition text. | +| `examples` | `?` | Usage examples illustrating this definition. | +| `notes` | `?` | Additional notes about this definition. | + +## `Entry` + +A dictionary entry representing a single headword and its associated data. + +Each entry contains the term itself, optional ranking metadata, +cross-reference information, etymologies, and media attachments. + +| Field | Type | Description | +|-------|------|-------------| +| `term` | `?` | The headword for this entry. | +| `rank` | `?` | Optional frequency rank for ordering entries. | +| `see_also` | `?` | Cross-reference target term, if this entry redirects to another. | +| `etymologies` | `?` | The etymologies associated with this entry. 
| +| `media` | `?` | Media URLs (audio, images, etc.) associated with this entry. | + +## `EnumWrapper` + +A wrapper for ODict enumeration values (e.g. part of speech, pronunciation kind). + +ODict enums are represented as string triples: the enum name, +the variant name, and the variant's string value. + +| Field | Type | Description | +|-------|------|-------------| +| `name` | `?` | The enum type name (e.g. `"PartOfSpeech"`). | +| `variant` | `?` | The variant name (e.g. `"Noun"`). | +| `value` | `?` | The string value of the variant (e.g. `"n"`). | + +## `Etymology` + +An etymology grouping for a dictionary entry. + +Etymologies group together senses that share a common word origin. +Each etymology can have its own pronunciations and description. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this etymology. | +| `pronunciations` | `?` | Pronunciations associated with this etymology. | +| `description` | `?` | Optional description of the word origin. | +| `senses` | `?` | The senses (meanings) under this etymology. | + +## `Example` + +A usage example illustrating a definition. + +Examples can optionally include translations and pronunciations. + +| Field | Type | Description | +|-------|------|-------------| +| `value` | `?` | The example text. | +| `translations` | `?` | Translations of this example into other languages. | +| `pronunciations` | `?` | Pronunciations for this example. | + +## `Form` + +An inflected or alternate form of a word. + +Forms represent morphological variants such as plurals, conjugations, +or other inflections. + +| Field | Type | Description | +|-------|------|-------------| +| `term` | `?` | The inflected form text. | +| `kind` | `?` | The kind of form (e.g. plural, past tense), or `None`. | +| `tags` | `?` | Tags for categorizing this form. | + +## `Group` + +A named group of related definitions. 
+ +Groups allow organizing multiple definitions under a shared description, +such as grouping definitions by semantic domain. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this group. | +| `description` | `?` | A description of what this group of definitions has in common. | +| `definitions` | `?` | The definitions within this group. | + +## `IndexOptions` + +Options for configuring full-text index creation. + +| Field | Type | Description | +|-------|------|-------------| +| `directory` | `?` | Custom directory for storing the index. | +| `memory` | `?` | Memory arena size per thread in bytes (must be >15 MB). | +| `overwrite` | `?` | Whether to overwrite an existing index. | + +## `LoadOptions` + +Options for loading a dictionary from a file path, alias, or remote registry. + +| Field | Type | Description | +|-------|------|-------------| +| `config_dir` | `?` | Custom configuration directory. | +| `remote` | `?` | Options for remote dictionary loading. | + +## `LookupOptions` + +Options for configuring term lookups. + +| Field | Type | Description | +|-------|------|-------------| +| `split` | `?` | Minimum word length for compound splitting. | +| `follow` | `?` | Whether to follow `see_also` cross-references. | +| `insensitive` | `?` | Whether to enable case-insensitive matching. | + +## `LookupResult` + +The result of a dictionary lookup. + +Contains the matched entry and, if a `see_also` redirect was followed, +the original entry that initiated the redirect. + +| Field | Type | Description | +|-------|------|-------------| +| `entry` | `?` | The matched dictionary entry. | +| `directed_from` | `?` | The original entry if a `see_also` redirect was followed, or `None`. | + +## `MediaURL` + +A reference to an external media resource (audio, image, etc.). + +| Field | Type | Description | +|-------|------|-------------| +| `src` | `?` | URL or path to the media file. 
| +| `mime_type` | `?` | MIME type (e.g. `audio/mpeg`), or `None`. | +| `description` | `?` | Human-readable description of the media. | + +## `Note` + +An additional note attached to a definition. + +Notes provide supplementary information such as usage guidance, +historical context, or grammatical remarks. + +| Field | Type | Description | +|-------|------|-------------| +| `id` | `?` | Optional identifier for this note. | +| `value` | `?` | The note text. | +| `examples` | `?` | Examples associated with this note. | + +## `OpenDictionary` + +The main class for working with compiled ODict dictionaries. + +An `OpenDictionary` wraps a compiled binary dictionary and provides +methods for looking up terms, full-text search, tokenization, and more. + +# Construction + +Create from compiled bytes or an XML string using [`OpenDictionary::new`], +or load from a file path or remote registry using [`OpenDictionary::load`]. + +## `Pronunciation` + +A pronunciation entry for a word or etymology. + +Represents how a word is pronounced in a given notation system +(e.g. IPA, Pinyin), with optional audio media. + +| Field | Type | Description | +|-------|------|-------------| +| `kind` | `?` | The pronunciation system (e.g. IPA, Pinyin), or `None`. | +| `value` | `?` | The pronunciation notation string. | +| `media` | `?` | Audio media URLs for this pronunciation. | + +## `RemoteLoadOptions` + +Options for loading dictionaries from remote registries. + +| Field | Type | Description | +|-------|------|-------------| +| `out_dir` | `?` | Custom output directory for downloaded files. | +| `caching` | `?` | Whether to cache downloaded dictionaries locally. | +| `retries` | `?` | Number of download retries on failure. | + +## `SaveOptions` + +Options for saving a dictionary to disk. + +| Field | Type | Description | +|-------|------|-------------| +| `compress` | `?` | Optional Brotli compression settings. | + +## `SearchOptions` + +Options for configuring full-text search. 
| + +| Field | Type | Description | +|-------|------|-------------| +| `directory` | `?` | Custom directory for the search index. | +| `threshold` | `?` | Relevance score threshold for filtering results. | +| `autoindex` | `?` | Whether to automatically create an index if one does not exist. | +| `limit` | `?` | Maximum number of results to return. | + +## `Sense` + +A word sense — a specific meaning grouped by part of speech. + +Senses represent distinct meanings of a word under a given etymology. +Each sense has a part of speech and contains definitions (or definition groups), +along with optional tags, translations, and inflected forms. + +| Field | Type | Description | +|-------|------|-------------| +| `pos` | `?` | The part of speech for this sense (e.g. noun, verb, adjective). | +| `lemma` | `?` | Optional lemma reference linking to another entry. | +| `definitions` | `?` | Definitions or definition groups under this sense. | +| `tags` | `?` | Tags for categorizing or filtering this sense. | +| `translations` | `?` | Translations of this sense into other languages. | +| `forms` | `?` | Inflected forms of the word under this sense. | + +## `Token` + +A token produced by NLP-based text segmentation. + +Each token represents a segment of the input text, with metadata about +its position, detected language and script, and any matching dictionary entries. + +| Field | Type | Description | +|-------|------|-------------| +| `lemma` | `?` | The original token text (lemma form). | +| `language` | `?` | Detected language code (e.g. `"eng"`), or `None` if unknown. | +| `entries` | `?` | Matched dictionary entries for this token. | +| `kind` | `?` | The token kind (e.g. `"Word"`, `"Punctuation"`). | +| `script` | `?` | Detected script name (e.g. `"Latin"`, `"Han"`). | +| `start` | `usize` | Start byte offset in the original text. | +| `end` | `usize` | End byte offset in the original text. | + +## `TokenizeOptions` + +Options for configuring text tokenization. 
| + +| Field | Type | Description | +|-------|------|-------------| +| `follow` | `?` | Whether to follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). | +| `insensitive` | `?` | Whether to enable case-insensitive matching. | + +## `Translation` + +A translation of a word, definition, or example into another language. + +| Field | Type | Description | +|-------|------|-------------| +| `lang` | `?` | The BCP-47 language code (e.g. `"fra"`, `"deu"`). | +| `value` | `?` | The translated text. | diff --git a/docs/scripts/generate-cli-docs.mjs b/docs/scripts/generate-cli-docs.mjs new file mode 100644 index 00000000..9672b774 --- /dev/null +++ b/docs/scripts/generate-cli-docs.mjs @@ -0,0 +1,454 @@ +/** + * Generates CLI reference documentation by parsing the clap arg definitions + * directly from the Rust source files in cli/src/. + * + * Run: node scripts/generate-cli-docs.mjs + * + * Outputs: src/content/docs/cli/reference.md + */ + +import { readFileSync, writeFileSync, mkdirSync, readdirSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const cliSrc = join(__dirname, "../../cli/src"); +const outPath = join(__dirname, "../src/content/docs/cli/reference.md"); + +// --------------------------------------------------------------------------- +// Read all Rust source files +// --------------------------------------------------------------------------- + +function readRustFile(relPath) { + return readFileSync(join(cliSrc, relPath), "utf-8"); +} + +// --------------------------------------------------------------------------- +// Parse the Commands enum from cli.rs to get command descriptions +// --------------------------------------------------------------------------- + +function parseCommandsEnum(source) { + const commands = {}; + // Match: /// doc comment followed by variant name + const re = 
/\/\/\/\s*(.*)\n\s*(?:#\[.*\]\n\s*)*(\w+)\((\w+)\)/g; + let m; + while ((m = re.exec(source)) !== null) { + const doc = m[1].trim(); + const variant = m[2]; + commands[variant] = doc; + } + return commands; +} + +// --------------------------------------------------------------------------- +// Parse #[arg(...)] fields from an Args struct +// --------------------------------------------------------------------------- + +function parseArgsStruct(source) { + const fields = []; + + // Find the struct body (everything between the first { and last }) + const structMatch = source.match( + /pub\s+struct\s+\w+Args\s*\{([\s\S]*?)\n\}/ + ); + if (!structMatch) return fields; + + const body = structMatch[1]; + + // Split by field declarations - each field may have preceding attributes and doc comments + // We look for patterns like: + // /// doc comment + // #[arg(...)] + // pub field_name: Type, + // -- or -- + // #[arg(..., help = "...")] + // field_name: Type, + + const fieldRegex = + /((?:\/\/\/[^\n]*\n\s*|#\[(?:arg|pyo3)[^\]]*\]\n\s*)*)\s*(?:pub(?:\((?:super|crate)\))?\s+)?(\w+)\s*:\s*([^,\n]+)/g; + + let fm; + while ((fm = fieldRegex.exec(body)) !== null) { + const attrs = fm[1]; + const name = fm[2]; + const type = fm[3].trim(); + + // Skip command subcommand fields + if (attrs.includes("#[command")) continue; + + // Parse #[arg(...)] attributes + const argAttr = attrs.match(/#\[arg\(([\s\S]*?)\)\]/); + const argContent = argAttr ? 
argAttr[1] : ""; + + // Extract help text + let help = extractQuoted(argContent, "help"); + + // Fall back to /// doc comments + if (!help) { + const docMatch = attrs.match(/\/\/\/\s*(.*)/); + if (docMatch) help = docMatch[1].trim(); + } + + // Extract short flag + let short = null; + const shortMatch = argContent.match( + /short\s*=\s*'([^']+)'/ + ); + if (shortMatch) { + short = `-${shortMatch[1]}`; + } else if (/\bshort\b/.test(argContent) && !/short\s*=/.test(argContent)) { + // bare `short` means use first char of field name + short = `-${name[0]}`; + } + + // Extract long flag + let long = null; + const longMatch = argContent.match( + /long\s*=\s*"([^"]+)"/ + ); + if (longMatch) { + long = `--${longMatch[1]}`; + } else if (/\blong\b/.test(argContent) && !/long\s*=/.test(argContent)) { + // bare `long` means use field name with _ -> - + long = `--${name.replace(/_/g, "-")}`; + } + + // Check if required + const required = + argContent.includes("required = true") || + (type !== "bool" && + !type.startsWith("Option<") && + !type.startsWith("Vec<") && + !short && + !long && + !argContent.includes("default_value")); + + // Check for default value + let defaultVal = null; + const defaultMatch = argContent.match( + /default_value_t\s*=\s*([^,\)]+)/ + ); + if (defaultMatch) { + defaultVal = defaultMatch[1].trim(); + // Clean up Rust-specific patterns + defaultVal = defaultVal + .replace(/crate::DEFAULT_RETRIES/, "3") + .replace(/DEFAULT_INDEX_MEMORY/, "15000000") + .replace(/DumpFormat::XML/, "xml") + .replace(/PrintFormat::Print/, "print") + .replace(/PrintFormat::JSON/, "json"); + } + + // Determine if this is a positional arg or a flag + const isPositional = !short && !long && !argContent.includes("default_value_t") && type !== "bool"; + + // Extract value_enum + const isValueEnum = argContent.includes("value_enum"); + + // Determine the arg type for display + let argType = null; + if (type === "bool" || type === "Option") { + argType = null; // boolean flags 
don't take a value + } else if (isValueEnum) { + argType = `<${name}>`; + } else if (type.includes("PathBuf") || type.includes("String")) { + argType = `<${name}>`; + } else if (type.includes("u32") || type.includes("usize") || type.includes("u16")) { + argType = `<${name}>`; + } else if (type.includes("Vec")) { + argType = `<${name}...>`; + } + + // Extract value_parser range info for help + const rangeMatch = argContent.match(/value_parser.*?range\((\d+)\.\.=(\d+)\)/); + if (rangeMatch) { + const rangeInfo = `(${rangeMatch[1]}–${rangeMatch[2]})`; + if (help && !help.includes(rangeMatch[1])) { + help = `${help} ${rangeInfo}`; + } + } + + fields.push({ + name, + type, + short, + long, + help: help || "", + required, + isPositional, + defaultVal, + argType, + }); + } + + return fields; +} + +function extractQuoted(text, key) { + // Match: key = "value" where value may span multiple lines due to formatting + const re = new RegExp(`${key}\\s*=\\s*"([^"]*)"`, "s"); + const m = re.exec(text); + return m ? 
m[1].trim() : null; +} + +// --------------------------------------------------------------------------- +// Parse the AliasCommands enum +// --------------------------------------------------------------------------- + +function parseAliasCommands(source) { + const commands = {}; + const re = /\/\/\/\s*(.*)\n\s*(?:#\[.*\]\n\s*)*(\w+)\((\w+)\)/g; + let m; + while ((m = re.exec(source)) !== null) { + commands[m[2]] = m[1].trim(); + } + return commands; +} + +// --------------------------------------------------------------------------- +// Parse HTTP serve endpoint structs from serve/ directory +// --------------------------------------------------------------------------- + +function parseServeEndpoints() { + const endpoints = []; + + for (const file of ["lookup.rs", "search.rs", "tokenize.rs"]) { + const source = readRustFile(`serve/${file}`); + + // Extract route path: #[get("/{name}/...")] + const routeMatch = source.match(/#\[get\("([^"]+)"\)\]/); + if (!routeMatch) continue; + const route = routeMatch[1]; + + // Extract request struct fields + const structMatch = source.match( + /pub\s+struct\s+(\w+Request)\s*\{([\s\S]*?)\}/ + ); + if (!structMatch) continue; + + const structName = structMatch[1]; + const body = structMatch[2]; + + const params = []; + const fieldRe = /(\w+)\s*:\s*([^,\n]+)/g; + let fm; + while ((fm = fieldRe.exec(body)) !== null) { + const name = fm[1]; + const type = fm[2].trim().replace(/,$/, ""); + const isOptional = type.startsWith("Option<"); + const innerType = isOptional + ? type.match(/Option<(\w+)>/)?.[1] || type + : type; + params.push({ + name, + type: innerType === "String" ? "string" : innerType === "bool" ? 
"boolean" : "number", + optional: isOptional, + }); + } + + endpoints.push({ route, params }); + } + + return endpoints; +} + +// --------------------------------------------------------------------------- +// Build CLI documentation from parsed source +// --------------------------------------------------------------------------- + +const cliSource = readRustFile("cli.rs"); +const commandDescs = parseCommandsEnum(cliSource); +const aliasSource = readRustFile("alias/alias.rs"); +const aliasDescs = parseAliasCommands(aliasSource); + +// Map command variant names to their source files +const commandFiles = { + Compile: "compile.rs", + Download: "download.rs", + Dump: "dump.rs", + Index: "index.rs", + Info: "info.rs", + Lexicon: "lexicon.rs", + Lookup: "lookup.rs", + Merge: "merge.rs", + New: "new.rs", + Search: "search.rs", + Serve: "serve/mod.rs", + Tokenize: "tokenize.rs", +}; + +const aliasFiles = { + Add: "alias/set.rs", + Set: "alias/set.rs", + Delete: "alias/delete.rs", +}; + +// Parse serve HTTP endpoints +const serveEndpoints = parseServeEndpoints(); + +// --------------------------------------------------------------------------- +// Render Markdown +// --------------------------------------------------------------------------- + +let md = `--- +title: CLI Reference +description: Complete reference for the ODict command-line interface. +--- + +{/* This file is auto-generated by scripts/generate-cli-docs.mjs — do not edit manually. */} + +\`\`\` +odict [OPTIONS] +\`\`\` + +The ODict CLI is the primary tool for creating, compiling, and querying ODict dictionaries. 

## Global options

| Option | Description |
|--------|-------------|
| \`-q, --quiet\` | Silence any non-important output |
| \`-h, --help\` | Print help |
| \`-V, --version\` | Print version |

---

## Commands

`;

// Render each main command: heading, description, usage synopsis, then
// tables for positional arguments and options parsed from the Rust source.
for (const [variant, file] of Object.entries(commandFiles)) {
  const source = readRustFile(file);
  const fields = parseArgsStruct(source);
  const desc = commandDescs[variant] || variant;
  const cmdName = variant.toLowerCase();

  md += `### \`odict ${cmdName}\`\n\n`;
  md += `${desc}.\n\n`;

  // Build usage string
  const positionals = fields.filter((f) => f.isPositional);
  const options = fields.filter((f) => !f.isPositional);
  let usage = `odict ${cmdName}`;
  for (const p of positionals) {
    // Vec-typed positionals accept multiple values (rendered with "...").
    if (p.type.includes("Vec<")) {
      usage += p.required ? ` <${p.name}...>` : ` [${p.name}...]`;
    } else {
      usage += p.required ? ` <${p.name}>` : ` [${p.name}]`;
    }
  }
  for (const o of options) {
    if (o.name === "retries") continue; // skip common retries flag in usage
    // Prefer the short flag in the synopsis; fall back to the long form.
    const flag = o.short || o.long;
    if (flag) {
      if (o.argType) {
        usage += ` [${flag} ${o.argType}]`;
      } else {
        usage += ` [${flag}]`;
      }
    }
  }
  md += `\`\`\`\n${usage}\n\`\`\`\n\n`;

  // Positional arguments table
  if (positionals.length > 0) {
    md += `#### Arguments\n\n`;
    md += `| Argument | Required | Description |\n`;
    md += `|----------|----------|-------------|\n`;
    for (const p of positionals) {
      md += `| \`${p.name}\` | ${p.required ? "Yes" : "No"} | ${p.help} |\n`;
    }
    md += `\n`;
  }

  // Options table
  if (options.length > 0) {
    md += `#### Options\n\n`;
    md += `| Flag | Description |\n`;
    md += `|------|-------------|\n`;
    for (const o of options) {
      const flags = [o.short, o.long].filter(Boolean).join(", ");
      // NOTE: `desc` here shadows the outer command description on purpose —
      // this one is the per-flag help text.
      let desc = o.help;
      // Append the default value unless the help text already mentions one.
      if (o.defaultVal && !desc.includes("default")) {
        desc += ` (default: \`${o.defaultVal}\`)`;
      }
      md += `| \`${flags}\` | ${desc} |\n`;
    }
    md += `\n`;
  }

  // HTTP endpoints for serve command
  if (cmdName === "serve" && serveEndpoints.length > 0) {
    md += `#### HTTP endpoints\n\n`;
    md += `When running \`odict serve\`, the following REST endpoints become available. All return JSON.\n\n`;

    for (const ep of serveEndpoints) {
      md += `##### \`GET ${ep.route}\`\n\n`;
      md += `| Parameter | Type | Required | Description |\n`;
      md += `|-----------|------|----------|-------------|\n`;
      for (const p of ep.params) {
        // Description column is left empty: query-param doc comments are not
        // extracted from the Rust source (yet).
        md += `| \`${p.name}\` | ${p.type} | ${p.optional ? "No" : "Yes"} | |\n`;
      }
      md += `\n`;
    }
  }

  md += `---\n\n`;
}

// Render alias subcommands (nested one level under `odict alias`)
md += `### \`odict alias\`\n\n`;
md += `Manage dictionary aliases.\n\n`;

for (const [variant, file] of Object.entries(aliasFiles)) {
  const source = readRustFile(file);
  const fields = parseArgsStruct(source);
  const desc = aliasDescs[variant] || variant;
  const cmdName = variant.toLowerCase();

  md += `#### \`odict alias ${cmdName}\`\n\n`;
  md += `${desc}.\n\n`;

  // Build usage
  const positionals = fields.filter((f) => f.isPositional);
  const options = fields.filter((f) => !f.isPositional);
  let usage = `odict alias ${cmdName}`;
  for (const p of positionals) {
    usage += p.required ?
 ` <${p.name}>` : ` [${p.name}]`;
  }
  md += `\`\`\`\n${usage}\n\`\`\`\n\n`;

  // Positional arguments table for the alias subcommand
  if (positionals.length > 0) {
    md += `| Argument | Required | Description |\n`;
    md += `|----------|----------|-------------|\n`;
    for (const p of positionals) {
      md += `| \`${p.name}\` | ${p.required ? "Yes" : "No"} | ${p.help} |\n`;
    }
    md += `\n`;
  }

  // Options table for the alias subcommand
  if (options.length > 0) {
    md += `| Flag | Description |\n`;
    md += `|------|-------------|\n`;
    for (const o of options) {
      const flags = [o.short, o.long].filter(Boolean).join(", ");
      let desc = o.help;
      // Append the default value unless the help text already mentions one.
      if (o.defaultVal && !desc.includes("default")) {
        desc += ` (default: \`${o.defaultVal}\`)`;
      }
      md += `| \`${flags}\` | ${desc} |\n`;
    }
    md += `\n`;
  }
}

md += `---\n`;

// ---------------------------------------------------------------------------
// Write output
// ---------------------------------------------------------------------------

mkdirSync(dirname(outPath), { recursive: true });
writeFileSync(outPath, md, "utf-8");
console.log(`✅ Generated CLI reference → ${outPath}`);
diff --git a/docs/scripts/generate-schema-docs.mjs b/docs/scripts/generate-schema-docs.mjs
new file mode 100644
index 00000000..2bcce7c3
--- /dev/null
+++ b/docs/scripts/generate-schema-docs.mjs
@@ -0,0 +1,423 @@
/**
 * Parses odict.xsd and lib/src/schema/pos.rs to generate
 * a Markdown reference page for the ODict XML schema.
 *
 * Run: node scripts/generate-schema-docs.mjs
 *
 * Outputs: src/content/docs/schema/reference.md
 */

import { readFileSync, writeFileSync, mkdirSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";

const __dirname = dirname(fileURLToPath(import.meta.url));
const xsdPath = join(__dirname, "../../odict.xsd");
const posPath = join(__dirname, "../../lib/src/schema/pos.rs");
const outPath = join(__dirname, "../src/content/docs/schema/reference.md");

const xsd = readFileSync(xsdPath, "utf-8");
const posSource = readFileSync(posPath, "utf-8");

// ---------------------------------------------------------------------------
// Tokenize XSD into open / close / self-closing tags
// ---------------------------------------------------------------------------

/**
 * Flattens the XSD into a token stream of
 * { type: "open" | "close" | "selfclose", tag, attrs }.
 *
 * The regex only recognizes tags whose attributes use double-quoted values.
 * NOTE(review): it does not special-case XML comments, CDATA, or processing
 * instructions — tag-like text inside those would still be tokenized; fine
 * as long as odict.xsd contains none. Verify if the XSD ever gains comments
 * with markup in them.
 */
function tokenize(xml) {
  const tokens = [];
  const re = /<(\/?)(\w[\w:.]*)((?:\s+[\w:]+\s*=\s*"[^"]*")*)\s*(\/?)>/g;
  let m;
  while ((m = re.exec(xml)) !== null) {
    const isClose = m[1] === "/";
    const tag = m[2];
    const attrStr = m[3];
    const isSelfClose = m[4] === "/";

    // Parse the attribute string into a plain name → value map.
    const attrs = {};
    const attrRe = /([\w:]+)\s*=\s*"([^"]*)"/g;
    let am;
    while ((am = attrRe.exec(attrStr)) !== null) {
      attrs[am[1]] = am[2];
    }

    if (isClose) {
      tokens.push({ type: "close", tag, attrs });
    } else if (isSelfClose) {
      tokens.push({ type: "selfclose", tag, attrs });
    } else {
      tokens.push({ type: "open", tag, attrs });
    }
  }
  return tokens;
}

// Module-level token stream every later phase reads from.
const tokens = tokenize(xsd);

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/** Find the index of the matching close tag for an open tag at `openIndex`.
*/ +function findClose(openIndex) { + const openTag = tokens[openIndex].tag; + let depth = 1; + for (let i = openIndex + 1; i < tokens.length; i++) { + if (tokens[i].tag === openTag) { + if (tokens[i].type === "open") depth++; + else if (tokens[i].type === "close") { + depth--; + if (depth === 0) return i; + } + } + } + return -1; +} + +/** + * Parse a complexType range (ctOpen..ctClose) for its direct attributes + * and direct child elements. "Direct" means not inside a nested + * xs:complexType — we track xs:complexType nesting depth and only + * collect items at depth 0. + */ +function parseComplexTypeRange(ctOpen, ctClose) { + const attributes = []; + const children = []; + let depth = 0; + + for (let i = ctOpen + 1; i < ctClose; i++) { + const t = tokens[i]; + + // Track nesting of inner xs:complexType blocks + if (t.tag === "xs:complexType") { + if (t.type === "open") depth++; + else if (t.type === "close") depth--; + continue; + } + + if (depth > 0) continue; // inside a nested complexType — skip + + // Collect attributes at depth 0 + if ( + t.tag === "xs:attribute" && + t.attrs.name && + (t.type === "selfclose" || t.type === "open") + ) { + attributes.push({ + name: t.attrs.name, + type: (t.attrs.type || "xs:string").replace("xs:", ""), + required: t.attrs.use === "required", + }); + } + + // Collect child elements at depth 0 + if ( + t.tag === "xs:element" && + t.attrs.name && + (t.type === "selfclose" || t.type === "open") + ) { + children.push({ + name: t.attrs.name, + type: t.attrs.type || undefined, + minOccurs: t.attrs.minOccurs ?? "1", + maxOccurs: t.attrs.maxOccurs ?? 
"1", + }); + + // Skip past this element's entire subtree + if (t.type === "open") { + i = findClose(i); + } + } + } + + return { attributes, children }; +} + +// --------------------------------------------------------------------------- +// Phase 1: Parse named complexTypes (defined at XSD top-level) +// --------------------------------------------------------------------------- + +const namedTypes = new Map(); + +for (let i = 0; i < tokens.length; i++) { + const t = tokens[i]; + if (t.tag === "xs:complexType" && t.type === "open" && t.attrs.name) { + const ctClose = findClose(i); + const { attributes, children } = parseComplexTypeRange(i, ctClose); + namedTypes.set(t.attrs.name, { attributes, children }); + i = ctClose; + } +} + +// --------------------------------------------------------------------------- +// Phase 2: Recursively walk the root element to build element map +// --------------------------------------------------------------------------- + +const elements = new Map(); + +/** Resolve a named complexType into an element descriptor and register children. */ +function resolveNamedType(typeName) { + const type = namedTypes.get(typeName); + if (!type) return { attributes: [], children: [] }; + + const children = type.children.map((c) => ({ + name: c.name, + minOccurs: c.minOccurs, + maxOccurs: c.maxOccurs, + })); + + // Recursively register child elements that reference named types + for (const child of type.children) { + if (!elements.has(child.name) && child.type && namedTypes.has(child.type)) { + elements.set(child.name, resolveNamedType(child.type)); + } else if (!elements.has(child.name)) { + elements.set(child.name, { attributes: [], children: [] }); + } + } + + return { attributes: [...type.attributes], children }; +} + +/** Process an xs:element token at `index` and register it in the elements map. 
*/ +function processElement(index) { + const t = tokens[index]; + const name = t.attrs.name; + const type = t.attrs.type; + + if (elements.has(name)) return; + + // Self-closing element or element with a named type + if (t.type === "selfclose") { + if (type && namedTypes.has(type)) { + elements.set(name, resolveNamedType(type)); + } else { + elements.set(name, { attributes: [], children: [] }); + } + return; + } + + // Open element with a named type (no inline complexType) + if (type && namedTypes.has(type)) { + elements.set(name, resolveNamedType(type)); + return; + } + + const elClose = findClose(index); + + // Find the inline xs:complexType within this element + for (let i = index + 1; i < elClose; i++) { + if (tokens[i].tag === "xs:complexType" && tokens[i].type === "open") { + const ctClose = findClose(i); + const { attributes, children } = parseComplexTypeRange(i, ctClose); + + elements.set(name, { + attributes, + children: children.map((c) => ({ + name: c.name, + minOccurs: c.minOccurs, + maxOccurs: c.maxOccurs, + })), + }); + + // Recursively process child elements found at depth 0 + let depth = 0; + for (let j = i + 1; j < ctClose; j++) { + if (tokens[j].tag === "xs:complexType") { + if (tokens[j].type === "open") depth++; + else if (tokens[j].type === "close") depth--; + continue; + } + if (depth > 0) continue; + + if ( + tokens[j].tag === "xs:element" && + tokens[j].attrs.name && + (tokens[j].type === "selfclose" || tokens[j].type === "open") + ) { + processElement(j); + if (tokens[j].type === "open") { + j = findClose(j); + } + } + } + + break; + } + } +} + +// Find the root and process it +for (let i = 0; i < tokens.length; i++) { + const t = tokens[i]; + if ( + t.tag === "xs:element" && + t.attrs.name === "dictionary" && + (t.type === "open" || t.type === "selfclose") + ) { + processElement(i); + break; + } +} + +// --------------------------------------------------------------------------- +// Parse POS codes from lib/src/schema/pos.rs +// 
--------------------------------------------------------------------------- + +function parsePosEnum(source) { + const entries = []; + const re = + /#\[strum\(to_string\s*=\s*"([^"]+)"\)\]\s*(?:#\[.*\]\s*)*(\w+)/g; + let m; + while ((m = re.exec(source)) !== null) { + const label = m[1]; + const variant = m[2]; + if (variant === "Other") continue; + entries.push({ variant, label }); + } + return entries; +} + +const allPos = parsePosEnum(posSource); + +const japaneseVariantPrefixes = [ + "AdjPn", "AdjKari", "AdjKu", "AdjNari", "AdjNa", "AdjShiku", + "AdjT", "AdjIx", "NAdv", "AdvTo", "AdjNo", "NPref", "NSuf", + "NT", "AdjF", "V5", "V1", "Vz", "Vk", "V2", "Vn", "Vr", + "VsC", "Vs", "VUnspec", "V4", +]; + +function isJapanese(variant) { + return japaneseVariantPrefixes.some( + (p) => variant === p || variant.startsWith(p) + ); +} + +const universalPos = allPos.filter((p) => !isJapanese(p.variant)); +const japanesePos = allPos.filter((p) => isJapanese(p.variant)); + +function variantToCode(variant) { + return variant + .replace(/([a-z])([A-Z])/g, "$1_$2") + .replace(/([A-Z]+)([A-Z][a-z])/g, "$1_$2") + .toLowerCase(); +} + +// --------------------------------------------------------------------------- +// Build element hierarchy tree (with deduplication via seen set) +// --------------------------------------------------------------------------- + +function buildTree(name, prefix = "", isLast = true, seen = new Set(), isRoot = true) { + const el = elements.get(name); + const connector = isRoot ? "" : isLast ? "└── " : "├── "; + + if (seen.has(name)) { + return `${prefix}${connector}${name} …\n`; + } + + let result = `${prefix}${connector}${name}\n`; + seen.add(name); + + if (!el || el.children.length === 0) return result; + + const childPrefix = isRoot ? "" : prefix + (isLast ? 
" " : "│ "); + + for (let i = 0; i < el.children.length; i++) { + const child = el.children[i]; + const childIsLast = i === el.children.length - 1; + result += buildTree(child.name, childPrefix, childIsLast, seen, false); + } + + return result; +} + +// --------------------------------------------------------------------------- +// Render Markdown +// --------------------------------------------------------------------------- + +const elementOrder = [ + "dictionary", "entry", "ety", "sense", "group", + "definition", "note", "example", "pronunciation", "url", +]; + +let md = `--- +title: XML Schema Reference +description: Complete reference for the ODict XML (ODXML) schema. +--- + +{/* This file is auto-generated by scripts/generate-schema-docs.mjs — do not edit manually. */} + +This page is automatically generated from [\`odict.xsd\`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd) and [\`pos.rs\`](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs). + +## Element hierarchy + +\`\`\` +${buildTree("dictionary").trimEnd()} +\`\`\` + +--- + +## Elements + +`; + +for (const name of elementOrder) { + const el = elements.get(name); + if (!el) continue; + + md += `### \`<${name}>\`\n\n`; + + if (el.attributes.length > 0) { + md += `#### Attributes\n\n`; + md += `| Attribute | Type | Required |\n`; + md += `|-----------|------|----------|\n`; + for (const a of el.attributes) { + md += `| \`${a.name}\` | \`${a.type}\` | ${a.required ? 
"Yes" : "No"} |\n`; + } + md += `\n`; + } + + if (el.children.length > 0) { + md += `#### Child elements\n\n`; + md += `| Element | Min | Max |\n`; + md += `|---------|-----|-----|\n`; + for (const c of el.children) { + md += `| [\`<${c.name}>\`](#${c.name}) | ${c.minOccurs} | ${c.maxOccurs} |\n`; + } + md += `\n`; + } + + md += `---\n\n`; +} + +// --------------------------------------------------------------------------- +// Parts of Speech +// --------------------------------------------------------------------------- + +md += `## Parts of speech\n\n`; +md += `The \`pos\` attribute on \`\` accepts the following values. You can also pass any custom string, which will be treated as a custom part of speech.\n\n`; + +md += `### Universal\n\n`; +md += `| Code | Label |\n`; +md += `|------|-------|\n`; +for (const p of universalPos) { + md += `| \`${variantToCode(p.variant)}\` | ${p.label} |\n`; +} +md += `\n`; + +md += `### Japanese-specific\n\n`; +md += `| Code | Label |\n`; +md += `|------|-------|\n`; +for (const p of japanesePos) { + md += `| \`${variantToCode(p.variant)}\` | ${p.label} |\n`; +} +md += `\n`; + +// --------------------------------------------------------------------------- +// Write output +// --------------------------------------------------------------------------- + +mkdirSync(dirname(outPath), { recursive: true }); +writeFileSync(outPath, md, "utf-8"); + +console.log( + `Generated schema reference -> ${outPath} (${elements.size} elements, ${allPos.length} POS codes)` +); diff --git a/docs/src/content.config.ts b/docs/src/content.config.ts new file mode 100644 index 00000000..a4eec59b --- /dev/null +++ b/docs/src/content.config.ts @@ -0,0 +1,6 @@ +import { defineCollection } from "astro:content"; +import { docsSchema } from "@astrojs/starlight/schema"; + +export const collections = { + docs: defineCollection({ schema: docsSchema() }), +}; diff --git a/docs/src/content/docs/api/javascript.md b/docs/src/content/docs/api/javascript.md new file 
mode 100644 index 00000000..2ca8c1a7 --- /dev/null +++ b/docs/src/content/docs/api/javascript.md @@ -0,0 +1,386 @@ +--- +title: JavaScript API +description: Using ODict from JavaScript/TypeScript via the @odict/node package. +--- + +The JavaScript bindings are distributed as `@odict/node` on npm. They are native extensions built with [NAPI-RS](https://napi.rs/) and also support the browser via WASI. + +## Installation + +```bash +npm install @odict/node +``` + +Requires Node.js 12+. Native binaries are included for all major platforms (macOS, Linux, Windows, ARM64, WASI). + +## Quick example + +```typescript +import { readFile } from "node:fs/promises"; +import { compile, OpenDictionary } from "@odict/node"; + +// Compile XML to a buffer +const xml = await readFile("my-dictionary.xml", "utf-8"); +const data = compile(xml); +const dictionary = new OpenDictionary(data); + +const results = dictionary.lookup("hello"); +console.log(results[0].entry.term); // "hello" +``` + +--- + +## Functions + +### `compile(xml: string): Buffer` + +Compiles an ODXML string into binary `.odict` data. Returns a `Buffer` that can be passed to `new OpenDictionary()`. + +```typescript +import { compile } from "@odict/node"; + +const data = compile(` + + + + + +`); +``` + +--- + +## `OpenDictionary` + +The main class for working with compiled dictionaries. + +### Constructors + +#### `new OpenDictionary(data: Buffer | string)` + +Creates a dictionary from compiled binary data (as returned by `compile()`) or directly from an XML string. + +```typescript +import { compile, OpenDictionary } from "@odict/node"; + +// From compiled buffer +const data = compile(xmlString); +const dictionary = new OpenDictionary(data); + +// Directly from XML string +const dictionary = new OpenDictionary(xmlString); +``` + +#### `OpenDictionary.load(dictionary: string, options?: LoadOptions): Promise` + +Loads a dictionary from a file path or remote identifier. Returns a `Promise`. 
+ +- If `dictionary` is a path to a `.odict` file, it loads from disk. +- If it matches the format `org/lang` (e.g. `wiktionary/eng`), it downloads from the remote registry. + +```typescript +import { OpenDictionary } from "@odict/node"; + +// Load from file +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); + +// Load from remote registry +const dictionary = await OpenDictionary.load("wiktionary/eng"); + +// Load with options +const dictionary = await OpenDictionary.load("wiktionary/eng", { + configDir: "./config", + remote: { caching: true, retries: 3 }, +}); +``` + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `minRank` | `number \| null` | The minimum rank value across all entries, or `null` if no entries have ranks | +| `maxRank` | `number \| null` | The maximum rank value across all entries, or `null` if no entries have ranks | + +### Methods + +#### `save(path: string, options?: SaveOptions): void` + +Saves the dictionary to disk as a `.odict` file. + +```typescript +dictionary.save("output.odict"); +dictionary.save("output.odict", { + compress: { quality: 11, windowSize: 22 }, +}); +``` + +#### `lookup(query: string | string[], options?: LookupOptions): LookupResult[]` + +Looks up one or more terms by exact match. 
+ +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `query` | `string \| string[]` | — | Term(s) to look up | +| `options.split` | `number` | — | Minimum word length for compound splitting | +| `options.follow` | `boolean` | — | Follow `see` cross-references until an entry with etymologies is found | +| `options.insensitive` | `boolean` | — | Enable case-insensitive matching | + +```typescript +// Simple lookup +const results = dictionary.lookup("cat"); + +// Multiple terms +const results = dictionary.lookup(["cat", "dog"]); + +// Follow cross-references, case-insensitive +const results = dictionary.lookup("RaN", { + follow: true, + insensitive: true, +}); +// results[0].entry.term === "run" +// results[0].directedFrom?.term === "ran" + +// Compound word splitting +const results = dictionary.lookup("catdog", { split: 3 }); +``` + +#### `lexicon(): string[]` + +Returns all terms defined in the dictionary, sorted alphabetically. + +```typescript +const words = dictionary.lexicon(); +// ["cat", "dog", "run", ...] +``` + +#### `index(options?: IndexOptions): void` + +Creates a full-text search index for the dictionary. + +```typescript +dictionary.index(); +dictionary.index({ overwrite: true, memory: 50_000_000 }); +``` + +#### `search(query: string, options?: SearchOptions): Entry[]` + +Runs a full-text search. Requires an index (call `index()` first). + +```typescript +dictionary.index(); + +const results = dictionary.search("domesticated mammal"); +const results = dictionary.search("greeting", { limit: 5 }); +``` + +#### `tokenize(text: string, options?: TokenizeOptions): Token[]` + +Tokenizes text and matches each token against the dictionary. Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages. 
 
```typescript
const tokens = dictionary.tokenize("the cat ran");
for (const token of tokens) {
  console.log(token.lemma, token.entries);
}

// With options
const tokens = dictionary.tokenize("DOG cat", {
  insensitive: true,
  follow: true,
});
```

---

## Types

### `LookupResult`

```typescript
interface LookupResult {
  entry: Entry;
  directedFrom?: Entry;
}
```

### `Entry`

```typescript
interface Entry {
  term: string;
  rank?: number;
  seeAlso?: string;
  etymologies: Etymology[];
  media: MediaUrl[];
}
```

### `Etymology`

```typescript
interface Etymology {
  id?: string;
  pronunciations: Pronunciation[];
  description?: string;
  senses: Sense[];
}
```

### `Sense`

```typescript
interface Sense {
  pos: EnumWrapper;
  lemma?: string;
  definitions: Array<Definition | Group>;
  tags: string[];
  translations: Translation[];
  forms: Form[];
}
```

### `Definition`

```typescript
interface Definition {
  id?: string;
  value: string;
  examples: Example[];
  notes: Note[];
}
```

### `Group`

```typescript
interface Group {
  id?: string;
  description: string;
  definitions: Definition[];
}
```

### `Example`

```typescript
interface Example {
  value: string;
  translations: Translation[];
  pronunciations: Pronunciation[];
}
```

### `Note`

```typescript
interface Note {
  id?: string;
  value: string;
  examples: Example[];
}
```

### `Pronunciation`

```typescript
interface Pronunciation {
  kind?: EnumWrapper;
  value: string;
  media: MediaUrl[];
}
```

### `MediaUrl`

```typescript
interface MediaUrl {
  src: string;
  mimeType?: string;
  description?: string;
}
```

### `Token`

```typescript
interface Token {
  lemma: string;
  language?: string;
  entries: LookupResult[];
  kind: string;
  script: string;
  start: number;
  end: number;
}
```

### `EnumWrapper`

```typescript
interface EnumWrapper {
  name: string;
  variant: string;
  value: string;
}
```

+### Options + +```typescript +interface LoadOptions { + configDir?: string; + remote?: RemoteLoadOptions; +} + +interface RemoteLoadOptions { + outDir?: string; + caching?: boolean; + retries?: number; +} + +interface SaveOptions { + compress?: CompressOptions; +} + +interface CompressOptions { + quality?: number; + windowSize?: number; +} + +interface LookupOptions { + split?: number; + follow?: boolean; + insensitive?: boolean; +} + +interface IndexOptions { + directory?: string; + memory?: number; + overwrite?: boolean; +} + +interface SearchOptions { + directory?: string; + threshold?: number; + autoindex?: boolean; + limit?: number; +} + +interface TokenizeOptions { + follow?: boolean; + allowList?: string[]; + insensitive?: boolean; +} +``` + +## Browser support + +The `@odict/node` package also supports browser environments via WASI. Import from the browser entry point: + +```typescript +import { compile, OpenDictionary } from "@odict/node/browser"; +``` + +:::note +Browser support runs ODict compiled to WebAssembly via WASI. The `load()` method (which accesses the filesystem and network) is not available in the browser — use `new OpenDictionary(data)` with pre-compiled data instead. +::: diff --git a/docs/src/content/docs/api/python.md b/docs/src/content/docs/api/python.md new file mode 100644 index 00000000..64aae17e --- /dev/null +++ b/docs/src/content/docs/api/python.md @@ -0,0 +1,281 @@ +--- +title: Python API +description: Using ODict from Python via the theopendictionary package. +--- + +The Python bindings are distributed as the `theopendictionary` package on PyPI. They are native extensions built with [PyO3](https://pyo3.rs/). + +## Installation + +```bash +pip install theopendictionary +``` + +Requires Python 3.8.1+. 
+ +## Quick example + +```python +from theopendictionary import OpenDictionary, compile + +# Compile XML to bytes +xml = """ + + + + + + + + + + + +""" + +compiled_bytes = compile(xml) +dictionary = OpenDictionary(compiled_bytes) + +results = dictionary.lookup("hello") +print(results[0].entry.term) # "hello" +print(results[0].entry.etymologies) # [Etymology(...)] +``` + +--- + +## Functions + +### `compile(xml: str) -> bytes` + +Compiles an ODXML string into binary `.odict` data (as a `bytes` object). This data can be passed to `OpenDictionary()` or saved to disk. + +```python +from theopendictionary import compile + +data = compile("") +``` + +--- + +## `OpenDictionary` + +The main class for working with compiled dictionaries. + +### Constructors + +#### `OpenDictionary(data: bytes | str)` + +Creates a dictionary from compiled binary data (as returned by `compile()`) or directly from an XML string. + +```python +from theopendictionary import OpenDictionary, compile + +# From compiled bytes +data = compile(xml_string) +dictionary = OpenDictionary(data) + +# Directly from XML string +dictionary = OpenDictionary(xml_string) +``` + +#### `await OpenDictionary.load(dictionary: str, options: LoadOptions | None = None) -> OpenDictionary` + +Loads a dictionary from a file path, alias, or remote identifier. This is an **async** method. + +- If `dictionary` is a path to a `.odict` file, it loads from disk. +- If it matches the format `org/lang` (e.g. `wiktionary/eng`), it downloads from the remote registry. 
+ +```python +import asyncio +from theopendictionary import OpenDictionary, LoadOptions, RemoteLoadOptions + +async def main(): + # Load from file + dictionary = await OpenDictionary.load("./my-dictionary.odict") + + # Load from remote registry + dictionary = await OpenDictionary.load("wiktionary/eng") + + # Load with options + opts = LoadOptions( + config_dir="./config", + remote=RemoteLoadOptions(caching=True) + ) + dictionary = await OpenDictionary.load("wiktionary/eng", opts) + +asyncio.run(main()) +``` + +### Properties + +| Property | Type | Description | +|----------|------|-------------| +| `min_rank` | `int \| None` | The minimum rank value across all entries, or `None` if no entries have ranks | +| `max_rank` | `int \| None` | The maximum rank value across all entries, or `None` if no entries have ranks | + +### Methods + +#### `save(path: str, quality: int | None = None, window_size: int | None = None) -> None` + +Saves the dictionary to disk as a `.odict` file. Optionally configure Brotli compression. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `path` | `str` | — | Output file path | +| `quality` | `int \| None` | `None` | Brotli compression level (0–11) | +| `window_size` | `int \| None` | `None` | Brotli window size (0–22) | + +```python +dictionary.save("output.odict") +dictionary.save("output.odict", quality=11, window_size=22) +``` + +#### `lookup(query, split=None, follow=None, insensitive=None) -> list[LookupResult]` + +Looks up one or more terms by exact match. 
+ +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `query` | `str \| list[str]` | — | Term(s) to look up | +| `split` | `int \| None` | `None` | Minimum word length for compound splitting | +| `follow` | `bool \| None` | `None` | Follow `see` cross-references until an entry with etymologies is found | +| `insensitive` | `bool \| None` | `None` | Enable case-insensitive matching | + +```python +# Simple lookup +results = dictionary.lookup("cat") + +# Multiple terms +results = dictionary.lookup(["cat", "dog"]) + +# Follow cross-references, case-insensitive +results = dictionary.lookup("RaN", follow=True, insensitive=True) +# results[0].entry.term == "run" +# results[0].directed_from.term == "ran" + +# Compound word splitting +results = dictionary.lookup("catdog", split=3) +``` + +#### `lexicon() -> list[str]` + +Returns all terms defined in the dictionary, sorted alphabetically. + +```python +words = dictionary.lexicon() +# ["cat", "dog", "run", ...] +``` + +#### `index(options=None) -> None` + +Creates a full-text search index for the dictionary. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `options` | `IndexOptions \| None` | `None` | Indexing configuration | + +```python +from theopendictionary import IndexOptions + +dictionary.index() +dictionary.index(IndexOptions(overwrite=True, memory=50_000_000)) +``` + +#### `search(query: str, options=None) -> list[Entry]` + +Runs a full-text search across the dictionary. Requires an index (call `index()` first). 
+ +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `query` | `str` | — | Search query | +| `options` | `SearchOptions \| None` | `None` | Search configuration | + +```python +from theopendictionary import SearchOptions + +dictionary.index() +results = dictionary.search("domesticated mammal") +results = dictionary.search("greeting", SearchOptions(limit=5)) +``` + +#### `tokenize(text: str, follow=None, insensitive=None) -> list[Token]` + +Tokenizes text using NLP-based segmentation and matches each token against the dictionary. Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `text` | `str` | — | Text to tokenize | +| `follow` | `bool \| int \| None` | `None` | Follow `see` cross-references. Accepts `True`/`False` or a number (nonzero = follow) | +| `insensitive` | `bool \| None` | `None` | Case-insensitive matching | + +```python +tokens = dictionary.tokenize("the cat ran") +for token in tokens: + print(token.lemma, token.entries) +``` + +--- + +## Types + +### `LookupResult` + +| Property | Type | Description | +|----------|------|-------------| +| `entry` | `Entry` | The matched entry | +| `directed_from` | `Entry \| None` | The original entry if a `see` redirect was followed | + +### `Entry` + +| Property | Type | Description | +|----------|------|-------------| +| `term` | `str` | The headword | +| `rank` | `int \| None` | Optional frequency rank | +| `see_also` | `str \| None` | Cross-reference target term | +| `etymologies` | `list[Etymology]` | List of etymologies | +| `media` | `list[MediaURL]` | Media URLs | + +### `Token` + +| Property | Type | Description | +|----------|------|-------------| +| `lemma` | `str` | The original token text | +| `language` | `str \| None` | Detected language code | +| `script` | `str` | Detected script name | +| `kind` | `str` | Token kind | 
+| `start` | `int` | Start offset in the original text | +| `end` | `int` | End offset in the original text | +| `entries` | `list[LookupResult]` | Matched dictionary entries | + +### `IndexOptions` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `directory` | `str \| None` | `None` | Custom directory for the index | +| `memory` | `int \| None` | `None` | Memory arena per thread in bytes (must be >15MB) | +| `overwrite` | `bool \| None` | `None` | Overwrite existing index | + +### `SearchOptions` + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `directory` | `str \| None` | `None` | Custom index directory | +| `threshold` | `int \| None` | `None` | Relevance threshold | +| `autoindex` | `bool \| None` | `None` | Auto-create index if missing | +| `limit` | `int \| None` | `None` | Maximum results | + +### `Pronunciation` + +| Property | Type | Description | +|----------|------|-------------| +| `kind` | `EnumWrapper \| None` | The pronunciation system (e.g. IPA, Pinyin) | +| `value` | `str` | The pronunciation notation | +| `media` | `list[MediaURL]` | Audio URLs | + +### `MediaURL` + +| Property | Type | Description | +|----------|------|-------------| +| `src` | `str` | URL or path to the media file | +| `mime_type` | `str \| None` | MIME type (e.g. `audio/mpeg`) | +| `description` | `str \| None` | Description of the media | diff --git a/docs/src/content/docs/api/rust.md b/docs/src/content/docs/api/rust.md new file mode 100644 index 00000000..02d6ab25 --- /dev/null +++ b/docs/src/content/docs/api/rust.md @@ -0,0 +1,98 @@ +--- +title: Rust API +description: Using the ODict Rust crate. +--- + +The `odict` crate is the core library that powers the CLI and all language bindings. It is published on [crates.io](https://crates.io/crates/odict). 
+ +## Installation + +Add to your `Cargo.toml`: + +```toml +[dependencies] +odict = "2" +``` + +## Documentation + +Full API documentation is available on **docs.rs**: + +**[docs.rs/odict](https://docs.rs/odict)** + +## Feature flags + +The `odict` crate uses feature flags to control which capabilities are compiled in. The `default` feature includes `sql` and `config`. + +| Feature | Description | +|---------|-------------| +| `default` | Enables `sql` and `config` | +| `sql` | SQL dump support (SQLite, PostgreSQL, MySQL) via sea-query | +| `config` | Access to platform-specific config directories | +| `alias` | Dictionary alias management (implies `config`) | +| `search` | Full-text search via Tantivy (implies `config`) | +| `markdown` | Markdown rendering support via pulldown-cmark | +| `html` | HTML output support (implies `markdown`) | +| `http` | Remote dictionary downloading (implies `config`) | +| `tokenize` | Full multi-language tokenization (enables all language tokenizers) | +| `tokenize-latin` | Latin-script tokenization | +| `tokenize-chinese` | Chinese segmentation | +| `tokenize-japanese` | Japanese segmentation (UniDic) | +| `tokenize-korean` | Korean segmentation | +| `tokenize-thai` | Thai segmentation | +| `tokenize-khmer` | Khmer segmentation | +| `tokenize-swedish` | Swedish recomposition | +| `tokenize-german` | German segmentation | + +## Quick example + +```rust +use odict::{OpenDictionary, ToDictionary}; + +fn main() -> odict::Result<()> { + // Compile from XML + let xml = r#" + + + + + + + + + + "#; + + // Compile and write to disk + let dict = xml.to_dictionary()?.build()?; + dict.to_disk("example.odict")?; + + // Read from disk + let file = OpenDictionary::from_path("example.odict")?; + let contents = file.contents()?; + + // Lookup + let results = contents.lookup( + &["hello"], + &odict::lookup::LookupOptions::default(), + )?; + + println!("{:?}", results); + Ok(()) +} +``` + +## Key traits and types + +| Type | Description | 
+|------|-------------|
+| `OpenDictionary` | A compiled dictionary loaded from disk or bytes |
+| `ToDictionary` | Trait for converting XML strings to `Dictionary` |
+| `Dictionary` | The deserialized dictionary schema type |
+| `CompilerOptions` | Options for compiling (compression settings) |
+| `lookup::LookupOptions` | Options for exact-match lookups |
+| `search::SearchOptions` | Options for full-text search |
+| `index::IndexOptions` | Options for creating a search index |
+| `tokenize::TokenizeOptions` | Options for text tokenization |
+
+Refer to the [docs.rs documentation](https://docs.rs/odict) for complete details on all types, traits, and methods.
diff --git a/docs/src/content/docs/cli/reference.md b/docs/src/content/docs/cli/reference.md
new file mode 100644
index 00000000..641bdc26
--- /dev/null
+++ b/docs/src/content/docs/cli/reference.md
@@ -0,0 +1,391 @@
+---
+title: CLI Reference
+description: Complete reference for the ODict command-line interface.
+---
+
+<!-- This file is auto-generated by scripts/generate-cli-docs.mjs — do not edit manually. -->
+
+```
+odict [OPTIONS]
+```
+
+The ODict CLI is the primary tool for creating, compiling, and querying ODict dictionaries.
+
+## Global options
+
+| Option | Description |
+|--------|-------------|
+| `-q, --quiet` | Silence any non-important output |
+| `-h, --help` | Print help |
+| `-V, --version` | Print version |
+
+---
+
+## Commands
+
+### `odict compile`
+
+Compiles a dictionary from ODXML.
+
+```
+odict compile [-o ] [-q ] [-w ]
+```
+
+#### Arguments
+
+| Argument | Required | Description |
+|----------|----------|-------------|
+| `input` | Yes | Path to ODXML file |
+
+#### Options
+
+| Flag | Description |
+|------|-------------|
+| `-o` | Output path of compiled dictionary |
+| `-q` | Brotli compression level (between 0 and 11) (default: `8`) |
+| `-w` | Brotli large window size (between 0 and 22) (default: `22`) |
+
+---
+
+### `odict download`
+
+Downloads a dictionary from the remote registry.
+
+```
+odict download [-o ] [--no-cache]
+```
+
+#### Arguments
+
+| Argument | Required | Description |
+|----------|----------|-------------|
+| `dictionary` | Yes | Dictionary to download (e.g., 'wiktionary/eng') |
+
+#### Options
+
+| Flag | Description |
+|------|-------------|
+| `-o, --output` | Directory to download the dictionary to (defaults to config directory) |
+| `--no-cache` | Disable caching (always download fresh copy) (default: `false`) |
+| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) |
+
+---
+
+### `odict dump`
+
+Outputs a dictionary in a human-readable format.
+
+```
+odict dump [-f] [-o ]
+```
+
+#### Arguments
+
+| Argument | Required | Description |
+|----------|----------|-------------|
+| `input` | Yes | Path to a compiled dictionary |
+
+#### Options
+
+| Flag | Description |
+|------|-------------|
+| `-f` | Format in which to dump the dictionary. (default: `xml`) |
+| `-o` | Output path of the dump. Defaults to stdout. |
+| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) |
+
+---
+
+### `odict index`
+
+Creates a full-text index of a compiled dictionary.
+ +``` +odict index [-d ] [-f] [-m ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary or an alias | + +#### Options + +| Flag | Description | +|------|-------------| +| `-d` | Custom directory to store the index | +| `-f` | Whether to overwrite the index if it already exists (default: `false`) | +| `-m` | Memory arena per thread in bytes. Must be above 15MB. (default: `15000000`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict info` + +Prints the metadata info for a dictionary file. + +``` +odict info +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary_path` | Yes | Path to a compiled dictionary | + +#### Options + +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict lexicon` + +Lists all words defined in a dictionary. + +``` +odict lexicon +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary | + +#### Options + +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict lookup` + +Looks up an entry in a compiled dictionary without indexing. 
+ +``` +odict lookup [-f ] [-F] [-s ] [-i] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary_path` | Yes | Path to a compiled dictionary | +| `queries` | Yes | Words to look up | + +#### Options + +| Flag | Description | +|------|-------------| +| `-f, --format` | Output format of the entries (default: `print`) | +| `-F, --follow` | Follow see_also redirects until finding an entry with etymologies | +| `-s, --split` | If a definition cannot be found, attempt to split the query into words of at least length S and look up each word separately. Can be relatively slow. (default: `0`) | +| `-i, --insensitive` | Perform case-insensitive lookups (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict merge` + +Merge entries from multiple dictionaries into a destination dictionary. + +``` +odict merge [-o ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `destination` | Yes | Path of the dictionary to merge into (unless --output is specified) | +| `sources` | Yes | Paths of dictionaries to merge | + +#### Options + +| Flag | Description | +|------|-------------| +| `-o, --output` | Separate output path for the compiled dictionary | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict new` + +Scaffolds a new ODict XML dictionary. + +``` +odict new [-n ] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `file_name` | Yes | Name of your new dictionary file | + +#### Options + +| Flag | Description | +|------|-------------| +| `-n` | Name attribute of the dictionary element | + +--- + +### `odict search` + +Run a full-text query on a compiled dictionary. 
+ +``` +odict search [-f] [--index] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary` | Yes | Path to a compiled dictionary or an alias | +| `query` | Yes | Search query | + +#### Options + +| Flag | Description | +|------|-------------| +| `-f, --format` | Format in which to print the results (default: `json`) | +| `--index` | Creates a new index if one doesn't already exist (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict serve` + +Start a local web server to serve one or several dictionaries. + +``` +odict serve [dictionaries...] [-p ] [-c ] [-l] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionaries` | No | | + +#### Options + +| Flag | Description | +|------|-------------| +| `-p` | Port to listen on (default: `5005`) | +| `-c, --capacity` | Maximum number of dictionaries to keep in memory (default: `5`) | +| `-l, --level` | | + +#### HTTP endpoints + +When running `odict serve`, the following REST endpoints become available. All return JSON. + +##### `GET /{name}/lookup` + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `q` | string | Yes | | +| `follow` | boolean | No | | +| `split` | number | No | | + +##### `GET /{name}/search` + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `q` | string | Yes | | +| `limit` | number | No | | + +##### `GET /{name}/tokenize` + +| Parameter | Type | Required | Description | +|-----------|------|----------|-------------| +| `text` | string | Yes | | +| `follow` | boolean | No | | + +--- + +### `odict tokenize` + +Tokenize text and find dictionary entries for each token. 
+ +``` +odict tokenize [-f ] [-F] [-i] +``` + +#### Arguments + +| Argument | Required | Description | +|----------|----------|-------------| +| `dictionary_path` | Yes | Path to a compiled dictionary | +| `text` | Yes | Text to tokenize | + +#### Options + +| Flag | Description | +|------|-------------| +| `-f, --format` | Output format of the entries (default: `print`) | +| `-F, --follow` | Follow see_also redirects until finding an entry with etymologies | +| `-i, --insensitive` | Perform case-insensitive lookups when matching tokens (default: `false`) | +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +--- + +### `odict alias` + +Manage dictionary aliases. + +#### `odict alias add` + +Attempts to create a new dictionary alias, failing if one already exists with the given name. + +``` +odict alias add +``` + +| Argument | Required | Description | +|----------|----------|-------------| +| `name` | Yes | Name of the alias | +| `path` | Yes | Dictionary path | + +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +#### `odict alias set` + +Creates or updates an existing dictionary alias. + +``` +odict alias set +``` + +| Argument | Required | Description | +|----------|----------|-------------| +| `name` | Yes | Name of the alias | +| `path` | Yes | Dictionary path | + +| Flag | Description | +|------|-------------| +| `-r, --retries` | Number of times to retry loading the dictionary (remote-only) (default: `3`) | + +#### `odict alias delete` + +Deletes an alias with the given name if it exists. 
+ +``` +odict alias delete +``` + +| Argument | Required | Description | +|----------|----------|-------------| +| `name` | Yes | Name of the alias | + +--- diff --git a/docs/src/content/docs/getting-started/installation.md b/docs/src/content/docs/getting-started/installation.md new file mode 100644 index 00000000..61f80d3d --- /dev/null +++ b/docs/src/content/docs/getting-started/installation.md @@ -0,0 +1,71 @@ +--- +title: Installation +description: How to install the ODict CLI and language bindings. +--- + +## CLI + +### Homebrew (macOS) + +```bash +brew install TheOpenDictionary/odict/odict +``` + +### Shell installer (macOS / Linux) + +```bash +curl --proto '=https' --tlsv1.2 -LsSf https://github.com/TheOpenDictionary/odict/releases/latest/download/odict-installer.sh | sh +``` + +### PowerShell installer (Windows) + +```powershell +powershell -ExecutionPolicy ByPass -c "irm https://github.com/TheOpenDictionary/odict/releases/latest/download/odict-installer.ps1 | iex" +``` + +### From source + +Requires [Rust](https://rustup.rs/) 1.75+. + +```bash +git clone https://github.com/TheOpenDictionary/odict.git +cd odict +cargo install --path cli +``` + +### Verify installation + +```bash +odict --version +``` + +--- + +## Language bindings + +### Python + +```bash +pip install theopendictionary +``` + +Requires Python 3.8.1+. See the [Python API docs](/api/python/) for usage. + +### JavaScript (Node.js) + +```bash +npm install @odict/node +``` + +Requires Node.js 12+. The package includes native binaries for all major platforms. See the [JavaScript API docs](/api/javascript/) for usage. + +### Rust + +Add the crate to your `Cargo.toml`: + +```toml +[dependencies] +odict = "2" +``` + +See the [Rust API docs](/api/rust/) for usage and feature flags. 
diff --git a/docs/src/content/docs/getting-started/introduction.md b/docs/src/content/docs/getting-started/introduction.md new file mode 100644 index 00000000..c5d86a2c --- /dev/null +++ b/docs/src/content/docs/getting-started/introduction.md @@ -0,0 +1,42 @@ +--- +title: Introduction +description: What is ODict and why does it exist? +--- + +**ODict** (The Open Dictionary) is a blazingly-fast, open-source dictionary file format designed for human languages. It provides a complete pipeline for defining, compiling, and querying dictionaries: + +1. **Define** your dictionary entries in a simple XML format (ODXML) +2. **Compile** the XML into a compact binary `.odict` file +3. **Query** the compiled dictionary using exact lookups, full-text search, or multi-language tokenization + +## Why ODict? + +Most dictionary data is locked in proprietary formats, scattered across inconsistent APIs, or stored in slow, bloated files. ODict addresses these problems: + +- **Universal schema** — A single, well-defined XML schema that can represent dictionaries for any human language, including etymologies, multiple senses, pronunciations, examples, and cross-references. +- **Fast binary format** — Compiled `.odict` files use [rkyv](https://rkyv.org/) for zero-copy deserialization and Brotli compression, making lookups extremely fast even on large dictionaries. +- **Full-text search** — Built-in indexing and search powered by [Tantivy](https://github.com/quickwit-oss/tantivy). +- **Multi-language tokenization** — Tokenize text in Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, and Latin-script languages, and automatically match tokens to dictionary entries. +- **Cross-platform bindings** — Use ODict from Rust, Python, JavaScript (Node.js and browser), or through the CLI and HTTP server. 
+ +## Architecture + +``` +┌─────────────┐ ┌──────────┐ ┌─────────────┐ +│ ODXML file │────▶│ Compiler │────▶│ .odict file │ +│ (XML) │ │ │ │ (binary) │ +└─────────────┘ └──────────┘ └──────┬──────┘ + │ + ┌───────────────────────┬┴──────────────────────┐ + │ │ │ + ┌─────▼─────┐ ┌──────▼──────┐ ┌──────▼──────┐ + │ Lookup │ │ Search │ │ Tokenize │ + │ (exact key)│ │ (full-text) │ │ (NLP-based) │ + └───────────┘ └─────────────┘ └─────────────┘ +``` + +## What's next? + +- [Install the CLI](/getting-started/installation/) to start working with dictionaries +- [Quick Start](/getting-started/quickstart/) walks you through creating and compiling your first dictionary +- Browse the [XML Schema Reference](/schema/reference/) to learn the full data model diff --git a/docs/src/content/docs/getting-started/quickstart.md b/docs/src/content/docs/getting-started/quickstart.md new file mode 100644 index 00000000..afc1023f --- /dev/null +++ b/docs/src/content/docs/getting-started/quickstart.md @@ -0,0 +1,170 @@ +--- +title: Quick Start +description: Create, compile, and query your first ODict dictionary. +--- + +This guide walks you through creating a simple dictionary, compiling it, and querying it with the CLI. + +## 1. Create a new dictionary + +Use the `odict new` command to scaffold a blank XML file: + +```bash +odict new animals -n "Animal Dictionary" +``` + +This creates `animals.xml`: + +```xml + + + +``` + +## 2. Add entries + +Open `animals.xml` and add some entries: + +```xml + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +``` + +:::tip +The `see` attribute creates a cross-reference. When you look up "kitty", ODict can follow it to the "cat" entry. +::: + +## 3. Compile the dictionary + +```bash +odict compile animals.xml +``` + +This produces `animals.odict` — a compact binary file. 
You can inspect it with:
+
+```bash
+odict info animals.odict
+```
+
+```
+Animal Dictionary
+─────────────────
+
+File Version: 3
+File Size: 312 B
+Entries: 3
+```
+
+## 4. Look up entries
+
+```bash
+odict lookup animals.odict cat
+```
+
+Output:
+
+```
+cat (From Latin cattus)
+
+  noun
+    1. A small domesticated carnivorous mammal with soft fur
+       • "The cat sat on the mat."
+       • "She adopted two cats from the shelter."
+    2. (informal) A person, especially a man
+       • "He's a cool cat."
+```
+
+### Follow cross-references
+
+```bash
+odict lookup animals.odict kitty -F
+```
+
+This follows the `see="cat"` redirect and returns the "cat" entry.
+
+### JSON output
+
+```bash
+odict lookup animals.odict cat -f json
+```
+
+Returns full structured JSON, useful for integration with other tools.
+
+## 5. Full-text search
+
+To search across all definitions, first create an index:
+
+```bash
+odict index animals.odict
+```
+
+Then search:
+
+```bash
+odict search animals.odict "domesticated mammal"
+```
+
+This returns all entries whose definitions match the query.
+
+:::note
+You can also pass `--index` to `odict search` to auto-create the index on the fly.
+:::
+
+## 6. Serve over HTTP
+
+Start a local server to query dictionaries via REST:
+
+```bash
+odict serve animals.odict -p 8080
+```
+
+Then query from any HTTP client:
+
+```bash
+# Lookup
+curl "http://localhost:8080/animals/lookup?q=cat,dog"
+
+# Search
+curl "http://localhost:8080/animals/search?q=domesticated"
+
+# Tokenize
+curl "http://localhost:8080/animals/tokenize?text=the+cat+and+the+dog"
+```
+
+## What's next?
+ +- [XML Schema Reference](/schema/reference/) — learn the full XML format including pronunciations, notes, and groups +- [CLI Reference](/cli/reference/) — complete command-line documentation +- Language bindings: [Python](/api/python/), [JavaScript](/api/javascript/), [Rust](/api/rust/) diff --git a/docs/src/content/docs/guides/compiling.mdx b/docs/src/content/docs/guides/compiling.mdx new file mode 100644 index 00000000..7b28d195 --- /dev/null +++ b/docs/src/content/docs/guides/compiling.mdx @@ -0,0 +1,242 @@ +--- +title: Compiling Dictionaries +description: How to compile ODict dictionaries programmatically from Rust, Python, and JavaScript. +--- + +import { Tabs, TabItem } from "@astrojs/starlight/components"; + +This guide shows how to compile ODXML into `.odict` binary files programmatically. For CLI usage, see the [Quick Start](/getting-started/quickstart/). + +## Compiling from an XML string + +The simplest approach is to compile an XML string directly into an in-memory dictionary. 
+
+<Tabs>
+<TabItem label="Rust">
+```rust
+use odict::{OpenDictionary, ToDictionary};
+
+fn main() -> odict::Result<()> {
+    let xml = r#"
+        <dictionary>
+          <entry term="hello">
+            <ety>
+              <sense pos="intj">
+                <definition value="A standard greeting" />
+              </sense>
+            </ety>
+          </entry>
+        </dictionary>
+    "#;
+
+    // Parse XML → build binary → get OpenDictionary
+    let dict = xml.to_dictionary()?.build()?;
+
+    // Write to disk
+    dict.to_disk("my-dictionary.odict")?;
+
+    Ok(())
+}
+```
+</TabItem>
+<TabItem label="Python">
+```python
+from theopendictionary import OpenDictionary, compile
+
+xml = """
+<dictionary>
+  <entry term="hello">
+    <ety>
+      <sense pos="intj">
+        <definition value="A standard greeting" />
+      </sense>
+    </ety>
+  </entry>
+</dictionary>
+"""
+
+# Option 1: compile() returns raw bytes
+compiled_bytes = compile(xml)
+dictionary = OpenDictionary(compiled_bytes)
+
+# Option 2: pass XML directly to the constructor
+dictionary = OpenDictionary(xml)
+
+# Save to disk
+dictionary.save("my-dictionary.odict")
+```
+</TabItem>
+<TabItem label="JavaScript">
+```typescript
+import { compile, OpenDictionary } from "@odict/node";
+
+const xml = `
+<dictionary>
+  <entry term="hello">
+    <ety>
+      <sense pos="intj">
+        <definition value="A standard greeting" />
+      </sense>
+    </ety>
+  </entry>
+</dictionary>
+`;
+
+// Option 1: compile() returns a Buffer
+const data = compile(xml);
+let dictionary = new OpenDictionary(data);
+
+// Option 2: pass XML directly to the constructor
+dictionary = new OpenDictionary(xml);
+
+// Save to disk
+dictionary.save("my-dictionary.odict");
+```
+</TabItem>
+</Tabs>
+
+## Compiling from an XML file
+
+If your XML lives on disk, read it first and then compile.
+
+<Tabs>
+<TabItem label="Rust">
+```rust
+use odict::schema::Dictionary;
+
+fn main() -> odict::Result<()> {
+    // Parse and compile from a file path
+    let dict = Dictionary::from_path("my-dictionary.xml")?
+ .build()?; + + dict.to_disk("my-dictionary.odict")?; + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary, compile + +with open("my-dictionary.xml", "r") as f: + xml = f.read() + +compiled_bytes = compile(xml) +dictionary = OpenDictionary(compiled_bytes) +dictionary.save("my-dictionary.odict") +``` + + +```typescript +import { readFile } from "node:fs/promises"; +import { compile, OpenDictionary } from "@odict/node"; + +const xml = await readFile("my-dictionary.xml", "utf-8"); +const data = compile(xml); +const dictionary = new OpenDictionary(data); +dictionary.save("my-dictionary.odict"); +``` + + + +## Compression options + +ODict uses Brotli compression. You can configure the compression level when saving. + + + +```rust +use odict::{compile::CompilerOptions, CompressOptions, ToDictionary}; + +fn main() -> odict::Result<()> { + let xml = std::fs::read_to_string("my-dictionary.xml")?; + + let compress = CompressOptions::default() + .quality(11) // Maximum compression (0–11) + .window_size(22); // Window size (0–22) + + let options = CompilerOptions::default() + .with_compression(compress); + + xml.as_str() + .to_dictionary()? + .build()? + .to_disk_with_options("my-dictionary.odict", options)?; + + Ok(()) +} +``` + + +```python +dictionary.save( + "my-dictionary.odict", + quality=11, # Maximum compression (0–11) + window_size=22 # Window size (0–22) +) +``` + + +```typescript +dictionary.save("my-dictionary.odict", { + compress: { + quality: 11, // Maximum compression (0–11) + windowSize: 22, // Window size (0–22) + }, +}); +``` + + + +## Loading compiled dictionaries + +Once compiled, you can load `.odict` files from disk or from the remote registry. 
+
+<Tabs>
+<TabItem label="Rust">
+```rust
+use odict::OpenDictionary;
+
+fn main() -> odict::Result<()> {
+    // Load from disk
+    let file = OpenDictionary::from_path("my-dictionary.odict")?;
+    let dict = file.contents()?;
+
+    println!("Entries: {}", dict.entries.len());
+
+    Ok(())
+}
+```
+</TabItem>
+<TabItem label="Python">
+```python
+import asyncio
+from theopendictionary import OpenDictionary
+
+async def main():
+    # Load from disk
+    dictionary = await OpenDictionary.load("./my-dictionary.odict")
+
+    # Load from remote registry
+    dictionary = await OpenDictionary.load("wiktionary/eng")
+
+    print(dictionary.lexicon())
+
+asyncio.run(main())
+```
+</TabItem>
+<TabItem label="JavaScript">
+```typescript
+import { OpenDictionary } from "@odict/node";
+
+// Load from disk
+let dictionary = await OpenDictionary.load("./my-dictionary.odict");
+
+// Load from remote registry
+dictionary = await OpenDictionary.load("wiktionary/eng");
+
+console.log(dictionary.lexicon());
+```
+</TabItem>
+</Tabs>
diff --git a/docs/src/content/docs/guides/lookup.mdx b/docs/src/content/docs/guides/lookup.mdx
new file mode 100644
index 00000000..cbcea0aa
--- /dev/null
+++ b/docs/src/content/docs/guides/lookup.mdx
@@ -0,0 +1,312 @@
+---
+title: Looking Up Entries
+description: How to look up dictionary entries by exact match from Rust, Python, and JavaScript.
+---
+
+import { Tabs, TabItem } from "@astrojs/starlight/components";
+
+Lookup is the fastest way to query a dictionary — it finds entries by exact term match without requiring an index.
+ +## Basic lookup + + + +```rust +use odict::{OpenDictionary, lookup::LookupOptions}; + +fn main() -> odict::Result<()> { + let file = OpenDictionary::from_path("my-dictionary.odict")?; + let dict = file.contents()?; + + let results = dict.lookup( + &vec!["cat"], + LookupOptions::default(), + )?; + + for result in &results { + println!("{}", result.entry.term.as_str()); + } + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary, compile + +dictionary = OpenDictionary("...") + +results = dictionary.lookup("cat") +print(results[0].entry.term) # "cat" +``` + + +```typescript +import { OpenDictionary } from "@odict/node"; + +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); + +const results = dictionary.lookup("cat"); +console.log(results[0].entry.term); // "cat" +``` + + + +## Looking up multiple terms + +You can look up several terms in a single call. Results are returned for each matched term. + + + +```rust +let results = dict.lookup( + &vec!["cat", "dog", "run"], + LookupOptions::default(), +)?; + +for result in &results { + println!("Found: {}", result.entry.term.as_str()); +} +``` + + +```python +results = dictionary.lookup(["cat", "dog", "run"]) + +for result in results: + print(f"Found: {result.entry.term}") +``` + + +```typescript +const results = dictionary.lookup(["cat", "dog", "run"]); + +for (const result of results) { + console.log(`Found: ${result.entry.term}`); +} +``` + + + +## Following cross-references + +Entries can redirect to other entries using the `see` attribute (e.g. "ran" → "run"). Enable `follow` to automatically resolve these. 
+ + + +```rust +use odict::lookup::LookupOptions; + +let options = LookupOptions::default().follow(true); + +let results = dict.lookup(&vec!["ran"], options)?; + +// "ran" redirects to "run" +assert_eq!(results[0].entry.term.as_str(), "run"); + +// directed_from tells you the original entry +if let Some(from) = &results[0].directed_from { + println!("Redirected from: {}", from.term.as_str()); +} +``` + + +```python +results = dictionary.lookup("ran", follow=True) + +# "ran" redirects to "run" +print(results[0].entry.term) # "run" +print(results[0].directed_from.term) # "ran" +``` + + +```typescript +const results = dictionary.lookup("ran", { follow: true }); + +// "ran" redirects to "run" +console.log(results[0].entry.term); // "run" +console.log(results[0].directedFrom?.term); // "ran" +``` + + + +:::tip +When `follow` is enabled, ODict walks the `see` chain until it finds an entry with etymologies. It also detects circular references and returns an error instead of looping infinitely. +::: + +## Case-insensitive lookup + +By default, lookups are case-sensitive. Enable `insensitive` to fall back to lowercase matching when the exact case doesn't match. + + + +```rust +let options = LookupOptions::default().insensitive(true); + +// "CAT" will match "cat" +let results = dict.lookup(&vec!["CAT"], options)?; + +assert_eq!(results[0].entry.term.as_str(), "cat"); +``` + + +```python +# "CAT" will match "cat" +results = dictionary.lookup("CAT", insensitive=True) + +print(results[0].entry.term) # "cat" +``` + + +```typescript +// "CAT" will match "cat" +const results = dictionary.lookup("CAT", { insensitive: true }); + +console.log(results[0].entry.term); // "cat" +``` + + + +## Compound word splitting + +If a term isn't found, ODict can split it into substrings and look up each part. The `split` parameter sets the minimum character length for each fragment. 
+ + + +```rust +use odict::lookup::{LookupOptions, LookupStrategy}; + +let options = LookupOptions::default() + .strategy(LookupStrategy::Split(3)); + +// "catdog" isn't a word, but "cat" and "dog" are +let results = dict.lookup(&vec!["catdog"], options)?; + +for result in &results { + println!("Found: {}", result.entry.term.as_str()); +} +// Prints: "cat", "dog" +``` + + +```python +# "catdog" isn't a word, but "cat" and "dog" are +results = dictionary.lookup("catdog", split=3) + +for result in results: + print(result.entry.term) +# Prints: "cat", "dog" +``` + + +```typescript +// "catdog" isn't a word, but "cat" and "dog" are +const results = dictionary.lookup("catdog", { split: 3 }); + +for (const result of results) { + console.log(result.entry.term); +} +// Prints: "cat", "dog" +``` + + + +## Combining options + +All lookup options can be combined. + + + +```rust +let options = LookupOptions::default() + .follow(true) + .insensitive(true) + .strategy(LookupStrategy::Split(3)); + +let results = dict.lookup(&vec!["RaN"], options)?; +``` + + +```python +results = dictionary.lookup("RaN", follow=True, insensitive=True, split=3) +``` + + +```typescript +const results = dictionary.lookup("RaN", { + follow: true, + insensitive: true, + split: 3, +}); +``` + + + +## Reading entry data + +Once you have a `LookupResult`, you can traverse the entry's structure: etymologies, senses, definitions, examples, and more. + + + +```python +results = dictionary.lookup("cat") +entry = results[0].entry + +print(f"Term: {entry.term}") + +for ety in entry.etymologies: + for sense in ety.senses: + print(f" Part of speech: {sense.pos}") + for defn in sense.definitions: + print(f" {defn.value}") + for example in defn.examples: + print(f" e.g. 
{example.value}") +``` + + +```typescript +const results = dictionary.lookup("cat"); +const entry = results[0].entry; + +console.log(`Term: ${entry.term}`); + +for (const ety of entry.etymologies) { + for (const sense of ety.senses) { + console.log(` Part of speech: ${sense.pos.value}`); + for (const defn of sense.definitions) { + if ("value" in defn) { + console.log(` ${defn.value}`); + for (const example of defn.examples) { + console.log(` e.g. ${example.value}`); + } + } + } + } +} +``` + + +```rust +let results = dict.lookup(&vec!["cat"], LookupOptions::default())?; + +for result in &results { + let entry = result.entry.deserialize()?; + + println!("Term: {}", entry.term); + + for ety in &entry.etymologies { + for (pos, sense) in &ety.senses { + println!(" Part of speech: {}", pos); + for defn in &sense.definitions { + println!(" {}", defn.value); + for example in &defn.examples { + println!(" e.g. {}", example.value); + } + } + } + } +} +``` + + diff --git a/docs/src/content/docs/guides/search.mdx b/docs/src/content/docs/guides/search.mdx new file mode 100644 index 00000000..86d64c80 --- /dev/null +++ b/docs/src/content/docs/guides/search.mdx @@ -0,0 +1,176 @@ +--- +title: Searching Dictionaries +description: How to index and run full-text searches on ODict dictionaries from Rust, Python, and JavaScript. +--- + +import { Tabs, TabItem } from "@astrojs/starlight/components"; + +Full-text search lets you find entries by matching against their definitions, not just their headwords. Unlike [lookup](/guides/lookup/) which requires an exact term match, search uses a [Tantivy](https://github.com/quickwit-oss/tantivy)-powered full-text index. + +## Creating an index + +Before you can search, you need to create a full-text index. This only needs to be done once per dictionary (the index is persisted to disk). 
+ + + +```rust +use odict::{OpenDictionary, index::IndexOptions}; + +fn main() -> odict::Result<()> { + let file = OpenDictionary::from_path("my-dictionary.odict")?; + let dict = file.contents()?; + + dict.index(IndexOptions::default())?; + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary + +dictionary = await OpenDictionary.load("./my-dictionary.odict") +dictionary.index() +``` + + +```typescript +import { OpenDictionary } from "@odict/node"; + +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); +dictionary.index(); +``` + + + +## Index options + +You can configure the indexing behavior. + + + +```rust +use odict::index::IndexOptions; + +let options = IndexOptions::default() + .dir("./my-index") // Custom index directory + .overwrite(true) // Overwrite existing index + .memory(50_000_000); // 50MB memory arena per thread + +dict.index(options)?; +``` + + +```python +from theopendictionary import IndexOptions + +dictionary.index(IndexOptions( + directory="./my-index", # Custom index directory + overwrite=True, # Overwrite existing index + memory=50_000_000 # 50MB memory arena per thread +)) +``` + + +```typescript +dictionary.index({ + directory: "./my-index", // Custom index directory + overwrite: true, // Overwrite existing index + memory: 50_000_000, // 50MB memory arena per thread +}); +``` + + + +## Running a search + +Once indexed, you can search across all definitions in the dictionary. 
+ + + +```rust +use odict::search::SearchOptions; + +let results = dict.search("domesticated mammal", SearchOptions::default())?; + +for entry in &results { + println!("{}", entry.term); +} +``` + + +```python +results = dictionary.search("domesticated mammal") + +for entry in results: + print(entry.term) +``` + + +```typescript +const results = dictionary.search("domesticated mammal"); + +for (const entry of results) { + console.log(entry.term); +} +``` + + + +## Search options + + + +```rust +use odict::search::SearchOptions; + +let options = SearchOptions::default() + .dir("./my-index") // Custom index directory + .autoindex(true) // Auto-create index if missing + .limit(10) // Max results to return + .threshold(50); // Relevance threshold + +let results = dict.search("greeting", options)?; +``` + + +```python +from theopendictionary import SearchOptions + +results = dictionary.search("greeting", SearchOptions( + directory="./my-index", # Custom index directory + autoindex=True, # Auto-create index if missing + limit=10, # Max results to return + threshold=50 # Relevance threshold +)) +``` + + +```typescript +const results = dictionary.search("greeting", { + directory: "./my-index", // Custom index directory + autoindex: true, // Auto-create index if missing + limit: 10, // Max results to return + threshold: 50, // Relevance threshold +}); +``` + + + +:::tip +The `autoindex` option is convenient for one-off scripts — it creates the index on the fly if one doesn't exist yet. For production use, create the index ahead of time with `index()` to avoid the startup cost on first search. +::: + +## Search vs. 
lookup + +| | Lookup | Search | +|---|--------|--------| +| **Matches against** | Entry terms (headwords) | Definition text | +| **Requires index** | No | Yes | +| **Speed** | O(1) per term | Depends on index size | +| **Use case** | You know the exact word | You're searching by meaning | +| **Supports splitting** | Yes | No | +| **Supports follow** | Yes | No | + +In most applications you'll use both: lookup for direct dictionary access, and search for discovery. diff --git a/docs/src/content/docs/guides/tokenize.mdx b/docs/src/content/docs/guides/tokenize.mdx new file mode 100644 index 00000000..22df8303 --- /dev/null +++ b/docs/src/content/docs/guides/tokenize.mdx @@ -0,0 +1,228 @@ +--- +title: Tokenizing Text +description: How to tokenize text and match tokens against dictionary entries using ODict's NLP tokenizer. +--- + +import { Tabs, TabItem } from "@astrojs/starlight/components"; + +ODict includes a built-in NLP tokenizer that segments text into words and automatically matches each token against dictionary entries. This is especially useful for languages without whitespace-delimited words (Chinese, Japanese, Korean, Thai, Khmer) as well as compound-word languages (German, Swedish). + +## Supported languages + +| Language family | Languages | Tokenizer | +|----------------|-----------|-----------| +| Chinese | Simplified & Traditional Chinese | jieba | +| Japanese | Japanese | Lindera (UniDic) | +| Korean | Korean | Lindera (KoDic) | +| Thai | Thai | ICU-based | +| Khmer | Khmer | ICU-based | +| Germanic | German, Swedish | Compound word splitting | +| Latin-script | English, French, Spanish, etc. 
| Unicode word boundaries | + +## Basic tokenization + + + +```rust +use odict::{OpenDictionary, tokenize::TokenizeOptions}; + +fn main() -> odict::Result<()> { + let file = OpenDictionary::from_path("my-dictionary.odict")?; + let dict = file.contents()?; + + let tokens = dict.tokenize( + "the cat ran", + TokenizeOptions::default(), + )?; + + for token in &tokens { + println!("'{}' ({} entries found)", + token.lemma, + token.entries.len() + ); + } + + Ok(()) +} +``` + + +```python +from theopendictionary import OpenDictionary + +dictionary = OpenDictionary("...") + +tokens = dictionary.tokenize("the cat ran") + +for token in tokens: + print(f"'{token.lemma}' ({len(token.entries)} entries found)") +``` + + +```typescript +import { OpenDictionary } from "@odict/node"; + +const dictionary = await OpenDictionary.load("./my-dictionary.odict"); + +const tokens = dictionary.tokenize("the cat ran"); + +for (const token of tokens) { + console.log(`'${token.lemma}' (${token.entries.length} entries found)`); +} +``` + + + +## Chinese text tokenization + +For Chinese (and other CJK languages), ODict automatically detects the script and uses the appropriate segmenter. + + + +```rust +let tokens = dict.tokenize("你好世界", TokenizeOptions::default())?; + +for token in &tokens { + println!("Lemma: {}, Script: {:?}, Language: {:?}", + token.lemma, + token.script.name(), + token.language.as_ref().map(|l| l.code()) + ); +} +``` + + +```python +tokens = dictionary.tokenize("你好世界") + +for token in tokens: + print(f"Lemma: {token.lemma}, Script: {token.script}, Language: {token.language}") +``` + + +```typescript +const tokens = dictionary.tokenize("你好世界"); + +for (const token of tokens) { + console.log(`Lemma: ${token.lemma}, Script: ${token.script}, Language: ${token.language}`); +} +``` + + + +## Following cross-references + +Like [lookup](/guides/lookup/), tokenization supports following `see` cross-references. 
+ + + +```rust +let options = TokenizeOptions::default().follow(true); + +let tokens = dict.tokenize("the cat ran", options)?; + +for token in &tokens { + for result in &token.entries { + if let Some(from) = &result.directed_from { + println!("'{}' → '{}'", + from.term.as_str(), + result.entry.term.as_str() + ); + } + } +} +// e.g. 'ran' → 'run' +``` + + +```python +tokens = dictionary.tokenize("the cat ran", follow=True) + +for token in tokens: + for result in token.entries: + if result.directed_from: + print(f"'{result.directed_from.term}' → '{result.entry.term}'") +# e.g. 'ran' → 'run' +``` + + +```typescript +const tokens = dictionary.tokenize("the cat ran", { follow: true }); + +for (const token of tokens) { + for (const result of token.entries) { + if (result.directedFrom) { + console.log(`'${result.directedFrom.term}' → '${result.entry.term}'`); + } + } +} +// e.g. 'ran' → 'run' +``` + + + +## Case-insensitive tokenization + + + +```rust +let options = TokenizeOptions::default().insensitive(true); + +// "DOG" will match the "dog" entry +let tokens = dict.tokenize("DOG cat", options)?; +``` + + +```python +# "DOG" will match the "dog" entry +tokens = dictionary.tokenize("DOG cat", insensitive=True) +``` + + +```typescript +// "DOG" will match the "dog" entry +const tokens = dictionary.tokenize("DOG cat", { insensitive: true }); +``` + + + +## Token properties + +Each token returned by `tokenize()` includes metadata about the match. + +| Property | Description | +|----------|-------------| +| `lemma` | The original text of the token as it appears in the input | +| `language` | Detected language code (e.g. `"cmn"` for Mandarin), if applicable | +| `script` | Detected script name (e.g. `"Han"`, `"Latin"`) | +| `kind` | Token kind (e.g. 
`"Word"`, `"Punctuation"`) | +| `start` | Start byte offset in the original text | +| `end` | End byte offset in the original text | +| `entries` | Array of `LookupResult` objects for matched dictionary entries | + +## Combining options + + + +```rust +let options = TokenizeOptions::default() + .follow(true) + .insensitive(true); + +let tokens = dict.tokenize("The CAT RaN away", options)?; +``` + + +```python +tokens = dictionary.tokenize("The CAT RaN away", follow=True, insensitive=True) +``` + + +```typescript +const tokens = dictionary.tokenize("The CAT RaN away", { + follow: true, + insensitive: true, +}); +``` + + diff --git a/docs/src/content/docs/index.mdx b/docs/src/content/docs/index.mdx new file mode 100644 index 00000000..55838e4a --- /dev/null +++ b/docs/src/content/docs/index.mdx @@ -0,0 +1,38 @@ +--- +title: ODict +description: The lightning-fast open-source dictionary file format for human languages. +template: splash +hero: + title: ODict + tagline: The lightning-fast open-source dictionary file format for human languages. + actions: + - text: Get Started + link: /getting-started/introduction/ + icon: right-arrow + - text: View on GitHub + link: https://github.com/TheOpenDictionary/odict + icon: external + variant: minimal +--- + +import { Card, CardGrid } from "@astrojs/starlight/components"; + + + + Define your dictionary entries using a simple, well-documented XML schema + (ODXML) that supports etymologies, senses, definitions, examples, + pronunciations, and more. + + + Compile your XML dictionaries into compact, blazingly-fast binary `.odict` + files using zero-copy deserialization via rkyv and Brotli compression. + + + Index and search your compiled dictionaries with built-in full-text search + powered by Tantivy, with multi-language tokenization support. + + + Native bindings for Python, JavaScript (Node.js and browser via WASI), and + Rust. Plus a powerful CLI and HTTP server for language-agnostic access. 
+ + diff --git a/docs/src/content/docs/schema/overview.md b/docs/src/content/docs/schema/overview.md new file mode 100644 index 00000000..e309d380 --- /dev/null +++ b/docs/src/content/docs/schema/overview.md @@ -0,0 +1,174 @@ +--- +title: Schema Overview +description: An overview of the ODict XML (ODXML) schema and how dictionaries are structured. +--- + +ODict dictionaries are authored in XML using the **ODXML** (Open Dictionary XML) format. This page provides a conceptual overview of how the schema is structured. For the full element-by-element reference, see the [Schema Reference](/schema/reference/). + +## Structure + +An ODXML file describes a dictionary as a hierarchy: + +``` +dictionary +└── entry (one per headword) + ├── pronunciation (optional, entry-level) + └── ety (etymology — groups senses by word origin) + └── sense (groups definitions by part of speech) + ├── group (optional grouping of definitions) + │ └── definition + │ ├── example + │ └── note + └── definition + ├── example + └── note +``` + +## Minimal example + +The simplest valid dictionary: + +```xml + + + + + + + + + +``` + +## Entries and cross-references + +Each `` represents a headword. Entries can either contain full definitions (via `` children) or redirect to another entry using the `see` attribute: + +```xml + + + + + + + + + + +``` + +When looking up "ran" with the `follow` option enabled, ODict will resolve the cross-reference and return the "run" entry. + +## Etymologies + +If a word has multiple distinct origins, you can define multiple `` elements: + +```xml + + + + + + + + + + + + +``` + +## Senses and parts of speech + +Within an etymology, `` elements group definitions by part of speech. The `pos` attribute accepts standard codes like `n` (noun), `v` (verb), `adj` (adjective), etc. See the [reference](/schema/reference/#parts-of-speech) for the full list. + +```xml + + + + + + +``` + +If the part of speech is unknown or not applicable, you can omit `pos` entirely. 
+
+## Definition groups
+
+When a sense has many definitions, you can organize them with `<group>`:
+
+```xml
+<sense pos="n">
+  <group description="Senses relating to animals">
+    <definition value="A small domesticated feline" />
+    <definition value="A wild animal of the cat family" />
+  </group>
+  <group description="Senses relating to people">
+    <definition value="A person, especially a jazz enthusiast" />
+  </group>
+</sense>
+```
+
+## Examples and notes
+
+Definitions can have `<example>` and `<note>` children:
+
+```xml
+<definition value="A small domesticated feline">
+  <example value="The cat sat on the mat." />
+  <note value="Often kept as a household pet.">
+    <example value="Our cat sleeps all day." />
+  </note>
+</definition>
+```
+
+## Pronunciations
+
+Pronunciations can be attached at the entry level and support any phonetic system:
+
+```xml
+<entry term="cat">
+  <pronunciation kind="ipa" value="/kæt/">
+    <url src="https://example.com/audio/cat.mp3" type="audio/mpeg" description="Recorded by a native speaker" />
+  </pronunciation>
+  <pronunciation kind="braille" value="⠉⠁⠞" />
+  <ety>
+    <sense pos="n">
+      <definition value="A small domesticated feline" />
+    </sense>
+  </ety>
+</entry>
+```
+
+This is especially useful for non-Latin scripts:
+
+```xml
+<entry term="猫">
+  <pronunciation kind="pinyin" value="māo" />
+  ...
+</entry>
+```
+
+## XSD validation
+
+The schema is formally defined in [`odict.xsd`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd). You can validate your XML against it:
+
+```xml
+<dictionary
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:noNamespaceSchemaLocation="https://raw.githubusercontent.com/TheOpenDictionary/odict/main/odict.xsd">
+  ...
+</dictionary>
+```
+
+Most XML editors (VS Code with the XML extension, IntelliJ, etc.) will provide autocomplete and validation when the XSD is referenced.
diff --git a/docs/src/content/docs/schema/reference.md b/docs/src/content/docs/schema/reference.md
new file mode 100644
index 00000000..0d78b7c6
--- /dev/null
+++ b/docs/src/content/docs/schema/reference.md
@@ -0,0 +1,328 @@
+---
+title: XML Schema Reference
+description: Complete reference for the ODict XML (ODXML) schema.
+---
+
+<!-- This file is auto-generated by scripts/generate-schema-docs.mjs — do not edit manually. -->
+
+This page is automatically generated from [`odict.xsd`](https://github.com/TheOpenDictionary/odict/blob/main/odict.xsd) and [`pos.rs`](https://github.com/TheOpenDictionary/odict/blob/main/lib/src/schema/pos.rs).
+ +## Element hierarchy + +``` +dictionary +└── entry + ├── pronunciation + │ └── url + └── ety + └── sense + ├── group + │ └── definition + │ ├── example + │ │ └── pronunciation … + │ └── note + │ └── example … + └── definition … +``` + +--- + +## Elements + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `name` | `string` | No | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#entry) | 1 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `term` | `string` | Yes | +| `see` | `string` | No | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#pronunciation) | 0 | unbounded | +| [``](#ety) | 0 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `pronunciation` | `string` | No | +| `description` | `string` | No | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#sense) | 1 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `pos` | `string` | No | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#group) | 0 | unbounded | +| [``](#definition) | 0 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `description` | `string` | No | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#definition) | 1 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `value` | `string` | Yes | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#example) | 0 | unbounded | +| [``](#note) | 0 | unbounded | 
+ +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `id` | `string` | No | +| `value` | `string` | Yes | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#example) | 1 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `value` | `string` | Yes | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#pronunciation) | 0 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `kind` | `string` | Yes | +| `value` | `string` | Yes | + +#### Child elements + +| Element | Min | Max | +|---------|-----|-----| +| [``](#url) | 0 | unbounded | + +--- + +### `` + +#### Attributes + +| Attribute | Type | Required | +|-----------|------|----------| +| `src` | `string` | Yes | +| `type` | `string` | No | +| `description` | `string` | No | + +--- + +## Parts of speech + +The `pos` attribute on `` accepts the following values. You can also pass any custom string, which will be treated as a custom part of speech. 
+ +### Universal + +| Code | Label | +|------|-------| +| `art` | article | +| `abv` | abbreviation | +| `adf` | adfix | +| `adj` | adjective | +| `phr_adj` | adjective phrase | +| `adv` | adverb | +| `phr_adv` | adverbial phrase | +| `aff` | affix | +| `aux` | auxiliary | +| `aux_adj` | auxiliary adjective | +| `aux_v` | auxiliary verb | +| `chr` | character | +| `cf` | circumfix | +| `cls` | classifier | +| `conj` | conjunction | +| `conj_c` | coordinating conjunction | +| `contr` | contraction | +| `cop` | copula | +| `ctr` | counter | +| `det` | determiner | +| `expr` | expression | +| `inf` | infix | +| `intf` | interfix | +| `intj` | interjection | +| `vi` | intransitive verb | +| `name` | name | +| `n` | noun | +| `num` | numeric | +| `part` | particle | +| `phr` | phrase | +| `postp` | postposition | +| `pref` | prefix | +| `prep` | preposition | +| `phr_prep` | prepositional phrase | +| `pron` | pronoun | +| `propn` | proper noun | +| `prov` | proverb | +| `punc` | punctuation | +| `conj_s` | subordinating conjunction | +| `suff` | suffix | +| `sym` | symbol | +| `vt` | transitive verb | +| `un` | unknown | +| `v` | verb | + +### Japanese-specific + +| Code | Label | +|------|-------| +| `adj_pn` | pre-noun adjectival (rentaishi) | +| `adj_kari` | 'kari' adjective (archaic) | +| `adj_ku` | 'ku' adjective (archaic) | +| `adj_nari` | archaic/formal form of na-adjective | +| `adj_na` | adjectival nouns or quasi-adjectives (keiyodoshi) | +| `adj_shiku` | 'shiku' adjective (archaic) | +| `adj_t` | 'taru' adjective | +| `adj_ix` | adjective (keiyoushi) - yoi/ii class | +| `n_adv` | adverbial noun (fukushitekimeishi) | +| `adv_to` | adverb taking the 'to' particle | +| `adj_no` | nouns which may take the genitive case particle 'no' | +| `n_pref` | noun, used as a prefix | +| `n_suf` | noun, used as a suffix | +| `nt` | noun (temporal) (jisoumeishi) | +| `adj_f` | noun or verb acting prenominally | +| `v5b` | Godan verb with 'bu' ending | +| `v5g` | Godan verb 
with 'gu' ending | +| `v5k` | Godan verb with 'ku' ending | +| `v5m` | Godan verb with 'mu' ending | +| `v5n` | Godan verb with 'nu' ending | +| `v5r` | Godan verb with 'ru' ending | +| `v5r_i` | Godan verb with 'ru' ending (irregular verb) | +| `v5aru` | Godan verb - -aru special class | +| `v5k_s` | Godan verb - Iku/Yuku special class | +| `v5s` | Godan verb with 'su' ending | +| `v5t` | Godan verb with 'tsu' ending | +| `v5u` | Godan verb with 'u' ending | +| `v5uru` | Godan verb - Uru old class verb (old form of Eru) | +| `v5u_s` | Godan verb with 'u' ending (special class) | +| `v1` | Ichidan verb | +| `v1s` | Ichidan verb - kureru special class | +| `vz` | Ichidan verb - zuru verb (alternative form of -jiru verbs) | +| `vk` | Kuru verb - special class | +| `v2b_s` | Nidan verb (lower class) with 'bu' ending (archaic) | +| `v2b_k` | Nidan verb (upper class) with 'bu' ending (archaic) | +| `v2d_s` | Nidan verb (lower class) with 'dzu' ending (archaic) | +| `v2d_k` | Nidan verb (upper class) with 'dzu' ending (archaic) | +| `v2g_s` | Nidan verb (lower class) with 'gu' ending (archaic) | +| `v2g_k` | Nidan verb (upper class) with 'gu' ending (archaic) | +| `v2h_s` | Nidan verb (lower class) with 'hu/fu' ending (archaic) | +| `v2h_k` | Nidan verb (upper class) with 'hu/fu' ending (archaic) | +| `v2k_s` | Nidan verb (lower class) with 'ku' ending (archaic) | +| `v2k_k` | Nidan verb (upper class) with 'ku' ending (archaic) | +| `v2m_s` | Nidan verb (lower class) with 'mu' ending (archaic) | +| `v2m_k` | Nidan verb (upper class) with 'mu' ending (archaic) | +| `v2n_s` | Nidan verb (lower class) with 'nu' ending (archaic) | +| `v2r_s` | Nidan verb (lower class) with 'ru' ending (archaic) | +| `v2r_k` | Nidan verb (upper class) with 'ru' ending (archaic) | +| `v2s_s` | Nidan verb (lower class) with 'su' ending (archaic) | +| `v2t_s` | Nidan verb (lower class) with 'tsu' ending (archaic) | +| `v2t_k` | Nidan verb (upper class) with 'tsu' ending (archaic) | +| `v2a_s` | 
Nidan verb with 'u' ending (archaic) | +| `v2w_s` | Nidan verb (lower class) with 'u' ending and 'we' conjugation (archaic) | +| `v2y_s` | Nidan verb (lower class) with 'yu' ending (archaic) | +| `v2y_k` | Nidan verb (upper class) with 'yu' ending (archaic) | +| `v2z_s` | Nidan verb (lower class) with 'zu' ending (archaic) | +| `vn` | irregular nu verb | +| `vr` | irregular ru verb, plain form ends with -ri | +| `vs_c` | su verb - precursor to the modern suru | +| `vs` | noun or participle which takes the aux. verb suru | +| `vs_i` | suru verb - included | +| `vs_s` | suru verb - special class | +| `v_unspec` | verb unspecified | +| `v4b` | Yodan verb with 'bu' ending (archaic) | +| `v4g` | Yodan verb with 'gu' ending (archaic) | +| `v4h` | Yodan verb with 'hu/fu' ending (archaic) | +| `v4k` | Yodan verb with 'ku' ending (archaic) | +| `v4m` | Yodan verb with 'mu' ending (archaic) | +| `v4n` | Yodan verb with 'nu' ending (archaic) | +| `v4r` | Yodan verb with 'ru' ending (archaic) | +| `v4s` | Yodan verb with 'su' ending (archaic) | +| `v4t` | Yodan verb with 'tsu' ending (archaic) | + diff --git a/docs/tsconfig.json b/docs/tsconfig.json new file mode 100644 index 00000000..bcbf8b50 --- /dev/null +++ b/docs/tsconfig.json @@ -0,0 +1,3 @@ +{ + "extends": "astro/tsconfigs/strict" +} diff --git a/lib/src/core/compile.rs b/lib/src/core/compile.rs index 0f2c8c34..d7f92a31 100644 --- a/lib/src/core/compile.rs +++ b/lib/src/core/compile.rs @@ -1,3 +1,45 @@ +//! Dictionary compilation and binary serialization. +//! +//! This module provides functionality to compile dictionary data structures into +//! the ODict binary format. The compilation process involves serialization, +//! compression, and packaging with metadata headers. +//! +//! # Binary Format Structure +//! +//! The ODict binary format consists of: +//! 1. **Signature** (5 bytes): "ODICT" magic bytes +//! 2. **Version Length** (8 bytes): Length of version string +//! 3. 
**Version** (variable): Semantic version string
+//! 4. **Content Length** (8 bytes): Length of compressed content
+//! 5. **Content** (variable): Compressed serialized dictionary data
+//!
+//! # Examples
+//!
+//! ## Basic Compilation
+//!
+//! ```rust
+//! use odict::{Dictionary, CompilerOptions};
+//!
+//! let dict = Dictionary::from_path("dictionary.xml")?;
+//! let compiled = dict.build()?;
+//! let bytes = compiled.to_bytes()?;
+//! # Ok::<(), Box<dyn std::error::Error>>(())
+//! ```
+//!
+//! ## Compilation with Custom Compression
+//!
+//! ```rust
+//! use odict::{Dictionary, CompilerOptions, CompressOptions};
+//!
+//! let dict = Dictionary::from_path("dictionary.xml")?;
+//! let compiled = dict.build()?;
+//!
+//! let options = CompilerOptions::default()
+//!     .with_compression(CompressOptions::default());
+//! let bytes = compiled.to_bytes_with_options(options)?;
+//! # Ok::<(), Box<dyn std::error::Error>>(())
+//! ```
+
 use crate::compress::{compress, CompressOptions};
 use crate::error::Error;
 use crate::schema::Dictionary;
@@ -5,8 +47,13 @@ use crate::OpenDictionary;

 use super::consts::{SIGNATURE, VERSION};

+/// Configuration options for dictionary compilation.
+///
+/// This struct allows customization of the compilation process, particularly
+/// compression settings that affect the final binary size and performance.
 #[derive(Default)]
 pub struct CompilerOptions {
+    /// Compression options to use during compilation.
     pub compress_options: CompressOptions,
 }

@@ -17,6 +64,20 @@ impl AsRef<CompilerOptions> for CompilerOptions {
 }

 impl CompilerOptions {
+    /// Set custom compression options for the compilation process.
+    ///
+    /// # Arguments
+    ///
+    /// * `compress_options` - The compression configuration to use
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use odict::{CompilerOptions, CompressOptions};
+    ///
+    /// let options = CompilerOptions::default()
+    ///     .with_compression(CompressOptions::default());
+    /// ```
     pub fn with_compression(mut self, compress_options: CompressOptions) -> Self {
         self.compress_options = compress_options;
         self
@@ -24,10 +85,70 @@
     }
 }

 impl OpenDictionary {
+    /// Convert the dictionary to binary format using default compilation options.
+    ///
+    /// This method serializes the dictionary into the ODict binary format,
+    /// applying default compression and packaging it with the appropriate headers.
+    ///
+    /// # Returns
+    ///
+    /// A `Vec<u8>` containing the complete binary representation of the dictionary.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - Compression fails
+    /// - Serialization fails
+    /// - Binary format validation fails
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use odict::{Dictionary, OpenDictionary};
+    ///
+    /// let dict = Dictionary::from_path("dictionary.xml")?;
+    /// let compiled = dict.build()?;
+    /// let bytes = compiled.to_bytes()?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
     pub fn to_bytes(&self) -> crate::Result<Vec<u8>> {
         self.to_bytes_with_options(CompilerOptions::default())
     }

+    /// Convert the dictionary to binary format with custom compilation options.
+    ///
+    /// This method provides fine-grained control over the compilation process,
+    /// allowing customization of compression settings and other options.
+    ///
+    /// # Arguments
+    ///
+    /// * `options` - Compilation options to customize the process
+    ///
+    /// # Returns
+    ///
+    /// A `Vec<u8>` containing the complete binary representation of the dictionary.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - Compression fails with the specified options
+    /// - Serialization fails
+    /// - Binary format validation fails
+    /// - Header construction fails
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use odict::{Dictionary, OpenDictionary, CompilerOptions, CompressOptions};
+    ///
+    /// let dict = Dictionary::from_path("dictionary.xml")?;
+    /// let compiled = dict.build()?;
+    ///
+    /// let options = CompilerOptions::default()
+    ///     .with_compression(CompressOptions::default());
+    /// let bytes = compiled.to_bytes_with_options(options)?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
     pub fn to_bytes_with_options<Options: AsRef<CompilerOptions>>(
         &self,
         options: Options,
@@ -86,6 +207,35 @@
 }

 impl Dictionary {
+    /// Build a compiled dictionary from the current dictionary data.
+    ///
+    /// This method transforms a [`Dictionary`] into an [`OpenDictionary`] by
+    /// serializing the dictionary data and preparing it for binary compilation.
+    /// The resulting [`OpenDictionary`] can then be converted to bytes or saved to disk.
+    ///
+    /// # Returns
+    ///
+    /// An [`OpenDictionary`] containing the serialized dictionary data with
+    /// appropriate metadata (signature, version, etc.).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - Dictionary serialization fails
+    /// - Memory allocation fails
+    ///
+    /// # Examples
+    ///
+    /// ```rust
+    /// use odict::Dictionary;
+    ///
+    /// let dict = Dictionary::from_path("dictionary.xml")?;
+    /// let compiled = dict.build()?;
+    ///
+    /// // Now you can save to disk or convert to bytes
+    /// compiled.to_disk("output.odict")?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```
     pub fn build(&self) -> crate::Result<OpenDictionary> {
         let dict = OpenDictionary {
             signature: String::from_utf8_lossy(SIGNATURE).to_string(),
diff --git a/lib/src/core/consts.rs b/lib/src/core/consts.rs
index 9a100a2b..22f9b6f7 100644
--- a/lib/src/core/consts.rs
+++ b/lib/src/core/consts.rs
@@ -1,8 +1,78 @@
+//! 
Core constants for the ODict binary format. +//! +//! This module defines the fundamental constants used throughout the ODict +//! library for binary format identification, versioning, and compatibility +//! checking. +//! +//! # Overview +//! +//! The constants defined here are used for: +//! - Binary format identification through magic signatures +//! - Version tracking and compatibility verification +//! - Ensuring consistent format standards across the library +//! +//! # Binary Format Identification +//! +//! The [`SIGNATURE`] constant provides the magic bytes that identify ODict +//! binary files. This signature is written at the beginning of every compiled +//! dictionary file and verified during reading operations. +//! +//! # Version Management +//! +//! The [`VERSION`] constant contains the current library version, automatically +//! derived from the Cargo package version. This is used for compatibility +//! checking when reading dictionary files created with different library versions. + use std::sync::LazyLock; use crate::version::SemanticVersion; +/// Magic signature bytes for ODict binary format identification. +/// +/// This 5-byte signature ("ODICT") is written at the beginning of every +/// compiled dictionary file to identify it as a valid ODict binary format. +/// The signature is checked during file reading to ensure format validity. +/// +/// # Format +/// +/// The signature consists of the ASCII bytes for "ODICT": +/// - `O` (0x4F) +/// - `D` (0x44) +/// - `I` (0x49) +/// - `C` (0x43) +/// - `T` (0x54) +/// +/// # Usage +/// +/// This constant is used internally by the reading and writing operations +/// and should not typically be used directly by library consumers. pub const SIGNATURE: &[u8] = b"ODICT"; +/// Current library version for compatibility checking. +/// +/// This constant contains the semantic version of the current library, +/// automatically derived from the Cargo package version at compile time. 
+/// It's used to ensure compatibility between dictionary files and the
+/// library version attempting to read them.
+///
+/// # Compatibility Rules
+///
+/// Dictionary files are considered compatible if they have:
+/// - The same major version number as the library
+/// - The same prerelease status (stable vs. prerelease)
+///
+/// # Lazy Initialization
+///
+/// The version is lazily initialized from the `CARGO_PKG_VERSION` environment
+/// variable, which is automatically set by Cargo during compilation. This
+/// ensures the version always matches the actual package version.
+///
+/// # Examples
+///
+/// ```rust
+/// use odict::core::consts::VERSION;
+///
+/// println!("Library version: {}", *VERSION);
+/// ```
 pub const VERSION: LazyLock<SemanticVersion> =
     LazyLock::new(|| SemanticVersion::from(env!("CARGO_PKG_VERSION")));
diff --git a/lib/src/core/lexicon.rs b/lib/src/core/lexicon.rs
index a85bb467..9540de77 100644
--- a/lib/src/core/lexicon.rs
+++ b/lib/src/core/lexicon.rs
@@ -1,8 +1,83 @@
+//! Lexicon extraction operations for ODict dictionaries.
+//!
+//! This module provides functionality to extract sorted lists of terms (lexicons)
+//! from dictionaries. A lexicon represents all the headwords/terms available
+//! in a dictionary, sorted alphabetically.
+//!
+//! # Overview
+//!
+//! The lexicon functionality allows you to:
+//! - Extract all terms from a dictionary as a sorted list
+//! - Get a quick overview of dictionary contents
+//! - Generate word lists for analysis or display
+//!
+//! # Examples
+//!
+//! ## Extracting a Lexicon from a Dictionary
+//!
+//! ```rust
+//! use odict::Dictionary;
+//!
+//! let dict = Dictionary::from_path("dictionary.xml")?;
+//! let terms = dict.lexicon();
+//!
+//! // Print all terms in alphabetical order
+//! for term in terms {
+//!     println!("{}", term);
+//! }
+//! # Ok::<(), Box<dyn std::error::Error>>(())
+//! ```
+//!
+//! ## Working with Archived Dictionaries
+//!
+//! ```rust
+//! use odict::OpenDictionary;
+//!
+//! 
let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! let terms = archived.lexicon(); +//! +//! println!("Dictionary contains {} terms", terms.len()); +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::{ArchivedDictionary, Dictionary}; macro_rules! lexicon { ($t:ident) => { impl $t { + /// Extract a sorted lexicon (list of terms) from the dictionary. + /// + /// This method collects all entry terms from the dictionary and returns + /// them as a sorted vector of string references. The terms are sorted + /// alphabetically using standard string ordering. + /// + /// # Returns + /// + /// A `Vec<&str>` containing all dictionary terms in alphabetical order. + /// Each term appears exactly once, even if there are multiple entries + /// with the same term. + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// let lexicon = dict.lexicon(); + /// + /// // Print first 10 terms + /// for term in lexicon.iter().take(10) { + /// println!("{}", term); + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(n log n) complexity due to sorting, where n is + /// the number of entries in the dictionary. The terms are collected + /// first, then sorted in-place. pub fn lexicon(&self) -> Vec<&str> { let mut vec: Vec<&str> = self .entries diff --git a/lib/src/core/lookup.rs b/lib/src/core/lookup.rs index db23ab44..129feb9e 100644 --- a/lib/src/core/lookup.rs +++ b/lib/src/core/lookup.rs @@ -1,20 +1,216 @@ +//! Advanced dictionary lookup operations for ODict. +//! +//! This module provides sophisticated search capabilities over dictionaries with +//! configurable matching strategies, redirect following via see_also links, and +//! case-insensitive fallback options. It supports both single and batch lookups +//! with parallel processing for optimal performance. +//! +//! # Overview +//! +//! 
The lookup system offers multiple layers of functionality: +//! +//! ## Matching Strategies +//! - **Exact matching**: Direct term-to-entry mapping +//! - **Split strategy**: Progressive substring matching for compound terms +//! +//! ## Advanced Features +//! - **Redirect following**: Automatic traversal of see_also links with cycle protection +//! - **Case-insensitive fallback**: Automatic retry with lowercase when exact match fails +//! - **Parallel processing**: Concurrent lookup of multiple queries for performance +//! - **Configurable limits**: Control redirect depth and matching behavior +//! +//! ## Performance Characteristics +//! - Single lookups: O(1) average case for exact matches +//! - Split strategy: O(n²) worst case where n is query length +//! - Parallel lookups: Scales with available CPU cores +//! - Memory efficient: Zero-copy results with lifetime management +//! +//! # Examples +//! +//! ## Basic Exact Lookup +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! let queries = vec!["hello"]; +//! let results = archived.lookup(&queries, LookupOptions::default())?; +//! +//! for result in results { +//! println!("Found: {}", result.entry.term.as_str()); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Advanced Lookup with Options +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! let options = LookupOptions::default() +//! .insensitive(true) // Enable case-insensitive fallback +//! .follow(3) // Follow up to 3 redirects +//! .strategy(LookupStrategy::Split(2)); // Split to minimum 2 chars +//! +//! let queries = vec!["Hello", "compound-word"]; +//! let results = archived.lookup(&queries, options)?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Handling Redirects +//! 
+//! ```rust +//! use odict::{OpenDictionary, LookupOptions}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! let options = LookupOptions::default().follow(5); +//! let queries = vec!["abbreviation"]; // Might redirect to full form +//! let results = archived.lookup(&queries, options)?; +//! +//! for result in results { +//! if let Some(redirect_from) = result.directed_from { +//! println!("'{}' redirected from '{}'", +//! result.entry.term.as_str(), +//! redirect_from.term.as_str()); +//! } +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Split Strategy for Compound Terms +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! // This will try "compound-word", then "compound", then "word" +//! let options = LookupOptions::default() +//! .strategy(LookupStrategy::Split(3)); // Minimum 3 characters +//! +//! let queries = vec!["compound-word"]; +//! let results = archived.lookup(&queries, options)?; +//! # Ok::<(), Box>(()) +//! ``` use crate::schema::{ArchivedDictionary, ArchivedEntry, Dictionary, Entry}; use rayon::prelude::*; use rkyv::option::ArchivedOption; use std::marker::{Send, Sync}; +/// Strategy for matching query terms against dictionary entries. +/// +/// This enum defines the different approaches available for finding matches +/// when performing dictionary lookups. Each strategy has different performance +/// characteristics and use cases. #[derive(Debug, PartialEq, Clone)] pub enum LookupStrategy { + /// Match queries exactly against entry terms. + /// + /// This is the fastest strategy, performing direct hash map lookups. + /// It requires the query to exactly match an entry term (case-sensitive + /// unless the `insensitive` option is enabled). 
+ /// + /// **Performance**: O(1) average case + /// **Use case**: When you know the exact term you're looking for + /// + /// # Examples + /// + /// ```rust + /// use odict::{LookupStrategy, LookupOptions}; + /// + /// let options = LookupOptions::default() + /// .strategy(LookupStrategy::Exact); + /// ``` Exact, + + /// Split the query into progressively smaller substrings down to `min_length`, + /// attempting to match each substring from left to right. + /// + /// This strategy is useful for compound words or when you want to find + /// partial matches. It starts with the full query and progressively + /// shortens it from the right until a match is found or the minimum + /// length is reached. + /// + /// **Performance**: O(n²) worst case where n is query length + /// **Use case**: Compound words, partial matching, morphological analysis + /// + /// # Algorithm + /// + /// For a query "compound-word" with min_length=3: + /// 1. Try "compound-word" (full query) + /// 2. Try "compound-wor", "compound-wo", etc. + /// 3. Try "compound" (if found, move to next segment) + /// 4. Try "word", "wor" (down to min_length) + /// + /// # Examples + /// + /// ```rust + /// use odict::{LookupStrategy, LookupOptions}; + /// + /// // Split down to minimum 3 characters + /// let options = LookupOptions::default() + /// .strategy(LookupStrategy::Split(3)); + /// ``` Split(usize), } +/// Configuration options for dictionary lookup operations. +/// +/// This struct provides fine-grained control over lookup behavior, including +/// redirect following, matching strategies, and case sensitivity. All options +/// have sensible defaults for common use cases. 
+/// +/// # Default Behavior +/// +/// - **No redirect following**: Prevents infinite loops and improves performance +/// - **Exact matching**: Most predictable and fastest lookup strategy +/// - **Case-sensitive search**: Preserves linguistic distinctions +/// +/// # Examples +/// +/// ## Basic Usage +/// +/// ```rust +/// use odict::LookupOptions; +/// +/// // Use all defaults +/// let options = LookupOptions::default(); +/// ``` +/// +/// ## Custom Configuration +/// +/// ```rust +/// use odict::{LookupOptions, LookupStrategy}; +/// +/// let options = LookupOptions::default() +/// .follow(5) // Follow up to 5 redirects +/// .insensitive(true) // Enable case-insensitive fallback +/// .strategy(LookupStrategy::Split(2)); // Split strategy with min length 2 +/// ``` #[derive(Debug, Clone)] pub struct LookupOptions { /// Whether to follow see_also links until finding an entry with etymologies. pub follow: bool, pub strategy: LookupStrategy, + + /// Whether to fall back to case-insensitive search if exact match fails. + /// + /// When enabled, if an exact (case-sensitive) match fails, the system + /// will automatically retry with a lowercase version of the query. + /// This is useful for handling user input that may have incorrect + /// capitalization. + /// + /// **Note**: The fallback only occurs if the lowercase version differs + /// from the original query, preventing unnecessary duplicate lookups. pub insensitive: bool, } @@ -25,6 +221,27 @@ impl AsRef for LookupOptions { } impl LookupOptions { + /// Construct default lookup options with safe, predictable settings. + /// + /// The default configuration prioritizes safety and performance: + /// - **No redirect following**: Prevents infinite loops and improves performance + /// - **Exact matching strategy**: Most predictable and fastest lookup method + /// - **Case-sensitive search**: Preserves linguistic distinctions + /// + /// # Returns + /// + /// A new `LookupOptions` instance with default settings. 
+ /// + /// # Examples + /// + /// ```rust + /// use odict::LookupOptions; + /// + /// let options = LookupOptions::default(); + /// assert_eq!(options.follow, None); + /// assert_eq!(options.strategy, odict::LookupStrategy::Exact); + /// assert_eq!(options.insensitive, false); + /// ``` pub fn default() -> Self { Self { follow: false, @@ -38,20 +255,140 @@ impl LookupOptions { self } + /// Set the matching strategy for query processing. + /// + /// The strategy determines how queries are matched against dictionary entries. + /// Different strategies have different performance characteristics and use cases. + /// + /// # Arguments + /// + /// * `strategy` - The [`LookupStrategy`] to use for matching + /// + /// # Examples + /// + /// ```rust + /// use odict::{LookupOptions, LookupStrategy}; + /// + /// // Use exact matching (fastest) + /// let exact = LookupOptions::default() + /// .strategy(LookupStrategy::Exact); + /// + /// // Use split strategy for compound words + /// let split = LookupOptions::default() + /// .strategy(LookupStrategy::Split(3)); + /// ``` pub fn strategy(mut self, strategy: LookupStrategy) -> Self { self.strategy = strategy; self } + /// Enable or disable case-insensitive fallback matching. + /// + /// When enabled, if an exact (case-sensitive) match fails, the system + /// automatically retries with a lowercase version of the query. This is + /// useful for handling user input with incorrect capitalization. 
+ /// + /// # Arguments + /// + /// * `insensitive` - Whether to enable case-insensitive fallback + /// + /// # Performance Impact + /// + /// - Minimal impact when exact matches succeed + /// - Adds one additional lookup when exact match fails and query contains uppercase + /// - No additional lookup if the query is already lowercase + /// + /// # Examples + /// + /// ```rust + /// use odict::LookupOptions; + /// + /// // Enable case-insensitive fallback + /// let options = LookupOptions::default().insensitive(true); + /// + /// // This will try "Hello" first, then "hello" if not found + /// // let results = dict.lookup(&["Hello"], options)?; + /// ``` pub fn insensitive(mut self, insensitive: bool) -> Self { self.insensitive = insensitive; self } } +/// Result of a dictionary lookup operation. +/// +/// This struct encapsulates the result of a successful lookup, including +/// the matched entry and optional redirect information. It provides context +/// about how the match was found, which is useful for understanding the +/// lookup path and handling redirects. 
+/// +/// # Generic Parameter +/// +/// * `E` - The entry type (either `&Entry` or `&ArchivedEntry`) +/// +/// # Examples +/// +/// ## Basic Usage +/// +/// ```rust +/// use odict::{OpenDictionary, LookupOptions}; +/// +/// let dict = OpenDictionary::from_path("dictionary.odict")?; +/// let archived = dict.contents()?; +/// let queries = vec!["hello"]; +/// let results = archived.lookup(&queries, LookupOptions::default())?; +/// +/// for result in results { +/// println!("Found: {}", result.entry.term.as_str()); +/// +/// if let Some(redirect_from) = result.directed_from { +/// println!(" (redirected from: {})", redirect_from.term.as_str()); +/// } +/// } +/// # Ok::<(), Box>(()) +/// ``` +/// +/// ## Checking for Redirects +/// +/// ```rust +/// use odict::{OpenDictionary, LookupOptions}; +/// +/// # fn example(results: Vec>) { +/// for result in results { +/// match result.directed_from { +/// Some(original) => { +/// println!("'{}' is an alias for '{}'", +/// original.term.as_str(), +/// result.entry.term.as_str()); +/// } +/// None => { +/// println!("Direct match: {}", result.entry.term.as_str()); +/// } +/// } +/// } +/// # } +/// ``` #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct LookupResult { + /// The matched dictionary entry. + /// + /// This is the final entry that was found, either through direct matching + /// or by following redirects. It contains all the linguistic data + /// (definitions, etymologies, pronunciations, etc.) for the term. pub entry: E, + + /// The entry that originally directed to this match via see_also links. + /// + /// This field is `Some(entry)` when the result was found by following + /// a redirect chain, containing the entry that started the redirect. + /// It's `None` for direct matches without any redirects. 
+ /// + /// # Use Cases + /// + /// - Displaying "redirected from" information to users + /// - Understanding alias relationships in the dictionary + /// - Debugging lookup paths and redirect chains + /// - Analytics on which redirects are commonly followed pub directed_from: Option, } @@ -62,6 +399,12 @@ pub struct LookupResult { macro_rules! lookup { ($tys:ident, $ret:ident, $opt:ident) => { impl $tys { + #[doc = r#"Attempt to find a single entry by term. + +This helper supports optional redirect following and an optional +case-insensitive retry (lowercasing the query) when configured. + +Returns Some(LookupResult) on a match, or None if not found."#] fn find_entry<'a>( &'a self, follow: &bool, @@ -132,6 +475,9 @@ macro_rules! lookup { Ok($opt::None) } + #[doc = r#"Perform lookup for a single query using the provided options. + +Depending on the strategy, this may return zero or more results."#] fn perform_lookup<'a, Options>( &'a self, query: &str, @@ -196,6 +542,26 @@ macro_rules! lookup { Ok(results) } + #[doc = r#"Lookup multiple queries in parallel. + +Each query is processed independently with the provided options. + +Returns all matches without a guaranteed order. + +Examples +-------- +```rust +use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +# fn demo(dict: &odict::OpenDictionary) -> odict::Result<()> { +let archived = dict.contents()?; +let queries = vec!["hello", "world"]; +let options = LookupOptions::default() + .insensitive(true) + .strategy(LookupStrategy::Exact); +let results = archived.lookup(&queries, options)?; +# Ok(()) +# } +```"#] pub fn lookup<'a, 'b, Query, Options>( &'a self, queries: &'b Vec, diff --git a/lib/src/core/merge.rs b/lib/src/core/merge.rs index 9944cbc8..bb65f115 100644 --- a/lib/src/core/merge.rs +++ b/lib/src/core/merge.rs @@ -1,12 +1,104 @@ +//! Dictionary merging operations for ODict. +//! +//! This module provides functionality to combine multiple dictionaries into a single +//! 
dictionary, preserving unique entries and avoiding duplicates. +//! +//! # Overview +//! +//! The merge operations allow you to: +//! - Merge a single dictionary into another +//! - Merge multiple dictionaries at once +//! - Preserve unique entries (no duplicates) +//! +//! # Examples +//! +//! ## Merging Two Dictionaries +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let mut dict1 = Dictionary::from_path("dict1.xml")?; +//! let dict2 = Dictionary::from_path("dict2.xml")?; +//! +//! // Merge dict2 into dict1 +//! dict1.merge(&dict2); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Merging Multiple Dictionaries +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let mut main_dict = Dictionary::from_path("main.xml")?; +//! let dict2 = Dictionary::from_path("dict2.xml")?; +//! let dict3 = Dictionary::from_path("dict3.xml")?; +//! +//! // Merge multiple dictionaries at once +//! main_dict.merge_multi(vec![&dict2, &dict3]); +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::Dictionary; impl Dictionary { + /// Merge multiple dictionaries into this dictionary. + /// + /// This is a convenience method that calls [`merge`](Dictionary::merge) for each + /// dictionary in the provided vector. Entries are processed in order, and + /// duplicates are automatically filtered out. + /// + /// # Arguments + /// + /// * `dictionaries` - A vector of dictionary references to merge + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let mut main_dict = Dictionary::from_path("main.xml")?; + /// let dict2 = Dictionary::from_path("dict2.xml")?; + /// let dict3 = Dictionary::from_path("dict3.xml")?; + /// + /// main_dict.merge_multi(vec![&dict2, &dict3]); + /// # Ok::<(), Box>(()) + /// ``` pub fn merge_multi(&mut self, dictionaries: Vec<&Dictionary>) { for src in dictionaries { self.merge(src); } } + /// Merge another dictionary into this dictionary. 
+ /// + /// This method adds all entries from the source dictionary that are not + /// already present in this dictionary. Duplicate entries (based on the + /// entry's equality implementation) are automatically filtered out. + /// + /// # Arguments + /// + /// * `dictionary` - The source dictionary to merge from + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let mut dict1 = Dictionary::from_path("dict1.xml")?; + /// let dict2 = Dictionary::from_path("dict2.xml")?; + /// + /// // Merge dict2 into dict1 + /// dict1.merge(&dict2); + /// + /// // dict1 now contains all unique entries from both dictionaries + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// The merge operation has O(n) complexity where n is the number of entries + /// in the source dictionary. Each entry is checked for existence before insertion. pub fn merge(&mut self, dictionary: &Dictionary) { for entry in dictionary.entries.iter() { if !self.entries.contains(entry) { diff --git a/lib/src/core/mod.rs b/lib/src/core/mod.rs index 976addb3..a428c4e9 100644 --- a/lib/src/core/mod.rs +++ b/lib/src/core/mod.rs @@ -1,3 +1,62 @@ +//! Core functionality for the ODict dictionary format. +//! +//! This module provides the fundamental operations for working with ODict dictionaries, +//! including compilation, reading, writing, lookup, and various utility functions. +//! +//! # Overview +//! +//! The core module is organized into several key areas: +//! +//! - **Compilation & Serialization**: [`compile`] - Convert dictionaries to binary format +//! - **Reading & Deserialization**: [`read`] - Load dictionaries from various sources +//! - **Writing**: [`write`] - Save dictionaries to disk +//! - **Lookup Operations**: [`lookup`] - Search and retrieve dictionary entries +//! - **Dictionary Management**: [`merge`], [`lexicon`] - Combine dictionaries and extract terms +//! 
- **Utilities**: [`preview`], [`rank`], [`resolve`] - Additional dictionary operations +//! - **Version Management**: [`version`] - Semantic versioning support +//! +//! # Examples +//! +//! ## Basic Dictionary Operations +//! +//! ```rust +//! use odict::{Dictionary, OpenDictionary}; +//! +//! // Load a dictionary from XML +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! +//! // Compile to binary format +//! let compiled = dict.build()?; +//! +//! // Save to disk +//! compiled.to_disk("dictionary.odict")?; +//! +//! // Load from binary +//! let loaded = OpenDictionary::from_path("dictionary.odict")?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Dictionary Lookup +//! +//! ```rust +//! use odict::{OpenDictionary, LookupOptions, LookupStrategy}; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! // Simple lookup +//! let queries = vec!["hello"]; +//! let results = archived.lookup(&queries, LookupOptions::default())?; +//! +//! // Advanced lookup with options +//! let options = LookupOptions::default() +//! .insensitive(true) +//! .follow(5) +//! .strategy(LookupStrategy::Split(2)); +//! let results = archived.lookup(&queries, options)?; +//! # Ok::<(), Box>(()) +//! ``` + mod consts; pub mod compile; diff --git a/lib/src/core/preview.rs b/lib/src/core/preview.rs index c498ec9c..9d29d275 100644 --- a/lib/src/core/preview.rs +++ b/lib/src/core/preview.rs @@ -1,12 +1,63 @@ +//! Entry preview generation for ODict dictionaries. +//! +//! This module provides functionality to generate concise text previews of dictionary +//! entries by extracting and concatenating their definitions. Previews are useful for +//! displaying quick summaries of entries without showing the full structured data. +//! +//! # Overview +//! +//! The preview functionality allows you to: +//! - Generate text summaries of dictionary entries +//! - Customize the delimiter used to separate definitions +//! 
- Handle both regular and grouped definitions +//! - Optionally convert markdown to plain text (when markdown feature is enabled) +//! +//! # Examples +//! +//! ## Basic Preview Generation +//! +//! ```rust +//! use odict::{Dictionary, PreviewOptions}; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! if let Some(entry) = dict.entries.iter().next() { +//! let preview = entry.preview(PreviewOptions::default()); +//! println!("Preview: {}", preview); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Custom Delimiter +//! +//! ```rust +//! use odict::{Dictionary, PreviewOptions}; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! if let Some(entry) = dict.entries.iter().next() { +//! let options = PreviewOptions::default().delimiter(" | ".to_string()); +//! let preview = entry.preview(options); +//! println!("Preview: {}", preview); +//! } +//! # Ok::<(), Box>(()) +//! ``` + #[cfg(feature = "markdown")] use crate::md::to_text; use crate::schema::{ArchivedDefinitionType, ArchivedEntry, DefinitionType, Entry}; +/// Configuration options for generating entry previews. +/// +/// This struct allows customization of how definitions are joined together +/// when creating a preview string from a dictionary entry. pub struct PreviewOptions { delimiter: String, } impl Default for PreviewOptions { + /// Create default preview options. + /// + /// The default delimiter is `"; "` (semicolon followed by space), which + /// provides a natural separation between multiple definitions. fn default() -> Self { Self { delimiter: "; ".to_string(), @@ -15,12 +66,30 @@ impl Default for PreviewOptions { } impl PreviewOptions { + /// Set a custom delimiter for joining definitions. 
+ /// + /// # Arguments + /// + /// * `delimiter` - The string to use for separating definitions in the preview + /// + /// # Examples + /// + /// ```rust + /// use odict::PreviewOptions; + /// + /// let options = PreviewOptions::default() + /// .delimiter(" | ".to_string()); + /// ``` pub fn delimiter(mut self, delimiter: String) -> Self { self.delimiter = delimiter; self } } +/// Convert text content to plain text. +/// +/// When the markdown feature is disabled, this function returns the input unchanged. +/// When the markdown feature is enabled, it converts markdown to plain text. #[cfg(not(feature = "markdown"))] fn to_text(value: &str) -> &str { value @@ -29,6 +98,48 @@ fn to_text(value: &str) -> &str { macro_rules! preview { ($t:ident, $d:ident) => { impl $t { + /// Generate a text preview of this dictionary entry. + /// + /// This method extracts all definitions from the entry's etymologies and senses, + /// converts them to plain text (if markdown feature is enabled), and joins them + /// using the specified delimiter. + /// + /// # Arguments + /// + /// * `options` - Configuration for preview generation + /// + /// # Returns + /// + /// A `String` containing all definitions joined by the specified delimiter. + /// If the entry has no definitions, returns an empty string. + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, PreviewOptions}; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// if let Some(entry) = dict.entries.iter().next() { + /// // Use default options ("; " delimiter) + /// let preview = entry.preview(PreviewOptions::default()); + /// + /// // Use custom delimiter + /// let custom_preview = entry.preview( + /// PreviewOptions::default().delimiter(" | ".to_string()) + /// ); + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Processing Order + /// + /// Definitions are processed in this order: + /// 1. Iterate through etymologies + /// 2. For each etymology, iterate through senses + /// 3. 
For each sense, iterate through definitions + /// 4. Handle both individual definitions and definition groups + /// 5. Convert markdown to text (if feature enabled) + /// 6. Join all definitions with the specified delimiter pub fn preview(&self, options: PreviewOptions) -> String { let definitions: Vec = self .etymologies diff --git a/lib/src/core/rank.rs b/lib/src/core/rank.rs index 75d92139..095e2e0a 100644 --- a/lib/src/core/rank.rs +++ b/lib/src/core/rank.rs @@ -1,6 +1,65 @@ +//! Entry ranking operations for ODict dictionaries. +//! +//! This module provides functionality to analyze and extract ranking information +//! from dictionary entries. Rankings are typically used to indicate word frequency, +//! importance, or usage patterns within a dictionary. +//! +//! # Overview +//! +//! The ranking functionality allows you to: +//! - Find the minimum rank across all entries +//! - Find the maximum rank across all entries +//! - Analyze ranking distribution in dictionaries +//! +//! # Ranking System +//! +//! Rankings are optional numeric values associated with dictionary entries. +//! Lower numbers typically indicate higher frequency or importance (e.g., rank 1 +//! might be the most common word). Not all entries are required to have ranks. +//! +//! # Examples +//! +//! ## Finding Rank Range +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! +//! if let Some(min) = dict.min_rank() { +//! println!("Highest priority rank: {}", min); +//! } +//! +//! if let Some(max) = dict.max_rank() { +//! println!("Lowest priority rank: {}", max); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Working with Archived Dictionaries +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! match (archived.min_rank(), archived.max_rank()) { +//! (Some(min), Some(max)) => { +//! 
println!("Rank range: {} to {}", min, max); +//! } +//! _ => println!("No ranked entries found"), +//! } +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::{ArchivedDictionary, Dictionary}; impl ArchivedDictionary { + /// Create an iterator over all rank values in the archived dictionary. + /// + /// This internal method filters entries to only those with rank values, + /// converting archived rank values to native u32 format. fn rank_iter(&self) -> impl Iterator + '_ { self.entries .iter() @@ -9,6 +68,9 @@ impl ArchivedDictionary { } impl Dictionary { + /// Create an iterator over all rank values in the dictionary. + /// + /// This internal method filters entries to only those with rank values. fn rank_iter(&self) -> impl Iterator + '_ { self.entries.iter().filter_map(|entry| entry.rank) } @@ -17,10 +79,68 @@ impl Dictionary { macro_rules! rank { ($t:ident) => { impl $t { + /// Find the minimum rank value across all entries in the dictionary. + /// + /// This method searches through all entries that have rank values and + /// returns the smallest rank number. Since lower ranks typically indicate + /// higher importance or frequency, this represents the "highest priority" entry. + /// + /// # Returns + /// + /// - `Some(u32)` - The minimum rank value if any entries have ranks + /// - `None` - If no entries in the dictionary have rank values + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// + /// match dict.min_rank() { + /// Some(min_rank) => println!("Most important entry has rank: {}", min_rank), + /// None => println!("No entries have rank information"), + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(n) complexity where n is the number of entries + /// in the dictionary, as it must examine all entries to find the minimum. 
pub fn min_rank(&self) -> Option { self.rank_iter().min() } + /// Find the maximum rank value across all entries in the dictionary. + /// + /// This method searches through all entries that have rank values and + /// returns the largest rank number. Since higher ranks typically indicate + /// lower importance or frequency, this represents the "lowest priority" entry. + /// + /// # Returns + /// + /// - `Some(u32)` - The maximum rank value if any entries have ranks + /// - `None` - If no entries in the dictionary have rank values + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// + /// match dict.max_rank() { + /// Some(max_rank) => println!("Least important entry has rank: {}", max_rank), + /// None => println!("No entries have rank information"), + /// } + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(n) complexity where n is the number of entries + /// in the dictionary, as it must examine all entries to find the maximum. pub fn max_rank(&self) -> Option { self.rank_iter().max() } diff --git a/lib/src/core/read.rs b/lib/src/core/read.rs index c6036ffc..66e31640 100644 --- a/lib/src/core/read.rs +++ b/lib/src/core/read.rs @@ -1,3 +1,60 @@ +//! Dictionary reading and deserialization operations for ODict. +//! +//! This module provides functionality to read and deserialize dictionaries from +//! various sources, including XML files and binary ODict format files. It handles +//! format validation, version compatibility checking, and decompression. +//! +//! # Overview +//! +//! The reading functionality supports: +//! - Loading dictionaries from XML files +//! - Loading compiled dictionaries from binary ODict files +//! - Reading from file paths or byte arrays +//! - Automatic format detection and validation +//! - Version compatibility verification +//! - Decompression of binary content +//! +//! # Binary Format Structure +//! 
+//! The ODict binary format consists of: +//! 1. **Signature** (5 bytes): "ODICT" magic bytes for format identification +//! 2. **Version Length** (8 bytes): Length of the version string in little-endian +//! 3. **Version** (variable): UTF-8 encoded semantic version string +//! 4. **Content Length** (8 bytes): Length of compressed content in little-endian +//! 5. **Content** (variable): Compressed serialized dictionary data +//! +//! # Examples +//! +//! ## Loading from XML +//! +//! ```rust +//! use odict::Dictionary; +//! +//! let dict = Dictionary::from_path("dictionary.xml")?; +//! println!("Loaded {} entries", dict.entries.len()); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Loading from Binary Format +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! println!("Dictionary version: {}", dict.version); +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Loading from Bytes +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let bytes = std::fs::read("dictionary.odict")?; +//! let dict = OpenDictionary::from_bytes(&bytes)?; +//! # Ok::<(), Box>(()) +//! ``` + use std::{ io::{Cursor, Read}, path::Path, @@ -20,6 +77,23 @@ use std::str::FromStr; /* Helper Methods */ /* -------------------------------------------------------------------------- */ +/// Read and validate the ODict signature from a binary stream. +/// +/// This function reads the first 5 bytes from the stream and verifies they +/// match the expected "ODICT" signature. This ensures the file is a valid +/// ODict binary format. +/// +/// # Arguments +/// +/// * `reader` - A cursor over the binary data +/// +/// # Returns +/// +/// The signature as a string if valid, or an error if invalid. +/// +/// # Errors +/// +/// Returns [`Error::InvalidSignature`] if the signature doesn't match "ODICT". fn read_signature(reader: &mut Cursor) -> crate::Result where T: AsRef<[u8]>, @@ -37,6 +111,23 @@ where Ok(String::from_utf8(signature)?) 
} +/// Read and validate the version information from a binary stream. +/// +/// This function reads the version length, then the version string, and +/// validates that it's compatible with the current library version. +/// +/// # Arguments +/// +/// * `reader` - A cursor over the binary data +/// +/// # Returns +/// +/// The parsed semantic version if compatible, or an error if incompatible. +/// +/// # Errors +/// +/// Returns [`Error::Incompatible`] if the version is not compatible with +/// the current library version. fn read_version(reader: &mut Cursor) -> crate::Result where T: AsRef<[u8]>, @@ -58,6 +149,22 @@ where Ok(version) } +/// Read and decompress the dictionary content from a binary stream. +/// +/// This function reads the content length, then the compressed content, +/// and decompresses it to obtain the raw serialized dictionary data. +/// +/// # Arguments +/// +/// * `reader` - A cursor over the binary data +/// +/// # Returns +/// +/// The decompressed content as a byte vector. +/// +/// # Errors +/// +/// Returns an error if decompression fails or if the content is corrupted. fn read_content(reader: &mut Cursor) -> crate::Result> where T: AsRef<[u8]>, @@ -76,10 +183,44 @@ where /* DictionaryReader */ /* -------------------------------------------------------------------------- */ +/// A reader for dictionary operations. +/// +/// This struct provides a namespace for dictionary reading operations, +/// though most functionality is implemented directly on the dictionary types. #[derive(Clone, Debug, Default)] pub struct DictionaryReader {} impl Dictionary { + /// Load a dictionary from an XML file. + /// + /// This method reads an XML file from the specified path and parses it + /// into a [`Dictionary`] structure. The XML must conform to the ODict + /// schema format. + /// + /// # Arguments + /// + /// * `path` - Path to the XML dictionary file + /// + /// # Returns + /// + /// A [`Dictionary`] instance containing the parsed data. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be read + /// - The XML is malformed or doesn't conform to the ODict schema + /// - File system permissions prevent access + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("examples/dictionary.xml")?; + /// println!("Loaded dictionary with {} entries", dict.entries.len()); + /// # Ok::<(), Box>(()) + /// ``` pub fn from_path>(path: P) -> crate::Result { let buffer = crate::fs::read_to_string(path)?; Self::from_str(&buffer) @@ -87,6 +228,38 @@ impl Dictionary { } impl OpenDictionary { + /// Load a compiled dictionary from binary data. + /// + /// This method parses binary data in the ODict format, validating the + /// signature, checking version compatibility, and decompressing the content. + /// The resulting [`OpenDictionary`] can be used for fast lookups and operations. + /// + /// # Arguments + /// + /// * `data` - Binary data in ODict format + /// + /// # Returns + /// + /// An [`OpenDictionary`] instance ready for use. + /// + /// # Errors + /// + /// Returns an error if: + /// - The signature is invalid (not an ODict file) + /// - The version is incompatible with this library + /// - The content cannot be decompressed + /// - The binary format is corrupted + /// + /// # Examples + /// + /// ```rust + /// use odict::OpenDictionary; + /// + /// let bytes = std::fs::read("dictionary.odict")?; + /// let dict = OpenDictionary::from_bytes(&bytes)?; + /// println!("Dictionary version: {}", dict.version); + /// # Ok::<(), Box>(()) + /// ``` pub fn from_bytes(data: T) -> crate::Result where T: AsRef<[u8]>, @@ -104,6 +277,39 @@ impl OpenDictionary { }) } + /// Load a compiled dictionary from a binary file. + /// + /// This method reads a binary ODict file from the specified path and + /// loads it into an [`OpenDictionary`] instance. The file path is stored + /// for reference. 
+ /// + /// # Arguments + /// + /// * `path` - Path to the binary ODict file + /// + /// # Returns + /// + /// An [`OpenDictionary`] instance with the path information preserved. + /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be read + /// - The file is not a valid ODict binary format + /// - The version is incompatible with this library + /// - File system permissions prevent access + /// + /// # Examples + /// + /// ```rust + /// use odict::OpenDictionary; + /// + /// let dict = OpenDictionary::from_path("dictionary.odict")?; + /// if let Some(path) = &dict.path { + /// println!("Loaded from: {}", path.display()); + /// } + /// # Ok::<(), Box>(()) + /// ``` pub fn from_path>(path: P) -> crate::Result { let buffer = fs::read_to_bytes(&path)?; let mut result = Self::from_bytes(&buffer)?; diff --git a/lib/src/core/resolve.rs b/lib/src/core/resolve.rs index 9b4f541f..f03a2e45 100644 --- a/lib/src/core/resolve.rs +++ b/lib/src/core/resolve.rs @@ -1,8 +1,103 @@ +//! Entry resolution operations for ODict dictionaries. +//! +//! This module provides functionality to resolve (look up) dictionary entries +//! by their exact term. Resolution is a simple, direct lookup operation that +//! returns the entry if it exists, or None if not found. +//! +//! # Overview +//! +//! The resolve functionality allows you to: +//! - Look up entries by exact term match +//! - Get direct access to entry data structures +//! - Perform fast O(1) lookups using the underlying hash map +//! +//! # Difference from Lookup +//! +//! Resolution differs from the more complex lookup operations in that it: +//! - Only performs exact matches (no fuzzy matching or strategies) +//! - Does not follow redirects or see_also links +//! - Does not support case-insensitive fallback +//! - Returns the raw entry structure rather than wrapped results +//! +//! # Examples +//! +//! ## Basic Entry Resolution +//! +//! ```rust +//! use odict::Dictionary; +//! +//!
let dict = Dictionary::from_path("dictionary.xml")?; +//! +//! if let Some(entry) = dict.resolve("hello") { +//! println!("Found entry for 'hello': {}", entry.term); +//! } else { +//! println!("No entry found for 'hello'"); +//! } +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Working with Archived Dictionaries +//! +//! ```rust +//! use odict::OpenDictionary; +//! +//! let dict = OpenDictionary::from_path("dictionary.odict")?; +//! let archived = dict.contents()?; +//! +//! if let Some(entry) = archived.resolve("world") { +//! println!("Found archived entry: {}", entry.term.as_str()); +//! } +//! # Ok::<(), Box>(()) +//! ``` + use crate::schema::{ArchivedDictionary, ArchivedEntry, Dictionary, Entry}; macro_rules! resolve { ($t:ident, $ret:ident) => { impl $t { + /// Resolve a dictionary entry by exact term match. + /// + /// This method performs a direct lookup in the dictionary's entry collection + /// using the provided term as the key. The lookup is case-sensitive and + /// requires an exact match. + /// + /// # Arguments + /// + /// * `term` - The exact term to look up in the dictionary + /// + /// # Returns + /// + /// - `Some(&Entry)` - A reference to the entry if found + /// - `None` - If no entry exists with the exact term + /// + /// # Examples + /// + /// ```rust + /// use odict::Dictionary; + /// + /// let dict = Dictionary::from_path("dictionary.xml")?; + /// + /// // Exact match lookup + /// if let Some(entry) = dict.resolve("hello") { + /// println!("Term: {}", entry.term); + /// println!("Etymologies: {}", entry.etymologies.len()); + /// } + /// + /// // Case-sensitive - this might not match if entry is "Hello" + /// let result = dict.resolve("Hello"); + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// This operation has O(1) average time complexity as it uses the underlying + /// hash map for direct key lookup. In the worst case (hash collisions), it + /// may degrade to O(n) but this is rare in practice. 
+ /// + /// # See Also + /// + /// For more advanced lookup operations with fuzzy matching, case-insensitive + /// search, and redirect following, see the [`lookup`](crate::core::lookup) module. pub fn resolve<'a>(&'a self, term: &str) -> Option<&'a $ret> { self.entries.get(term) } diff --git a/lib/src/core/version.rs b/lib/src/core/version.rs index 605da252..6e592d2a 100644 --- a/lib/src/core/version.rs +++ b/lib/src/core/version.rs @@ -1,17 +1,100 @@ +//! Semantic versioning support for ODict dictionaries. +//! +//! This module provides a semantic versioning implementation that follows the +//! [Semantic Versioning 2.0.0](https://semver.org/) specification. It's used +//! to track dictionary format versions and ensure compatibility between different +//! versions of the ODict library. +//! +//! # Overview +//! +//! The semantic versioning functionality provides: +//! - Version parsing from strings +//! - Version comparison and ordering +//! - Compatibility checking between versions +//! - Prerelease version support +//! +//! # Compatibility Rules +//! +//! Two versions are considered compatible if: +//! - They have the same major version number +//! - They have the same prerelease status (both stable, or both prerelease with the same identifier) +//! +//! # Examples +//! +//! ## Creating and Comparing Versions +//! +//! ```rust +//! use odict::SemanticVersion; +//! +//! let v1 = SemanticVersion::new(1, 2, 3, None); +//! let v2: SemanticVersion = "1.2.4".into(); +//! let v3: SemanticVersion = "2.0.0".into(); +//! +//! assert!(v1 < v2); +//! assert!(v1.is_compatible(&v2)); +//! assert!(!v1.is_compatible(&v3)); +//! ``` +//! +//! ## Working with Prerelease Versions +//! +//! ```rust +//! use odict::SemanticVersion; +//! +//! let stable: SemanticVersion = "1.0.0".into(); +//! let prerelease: SemanticVersion = "1.0.0-alpha".into(); +//! +//! assert!(prerelease < stable); +//! assert!(!stable.is_compatible(&prerelease)); +//!
``` + use std::{ cmp::Ordering, fmt::{Display, Formatter}, }; +/// A semantic version following the Semantic Versioning 2.0.0 specification. +/// +/// This struct represents a version number in the format `MAJOR.MINOR.PATCH[-PRERELEASE]` +/// where each component has specific meaning: +/// - **MAJOR**: Incremented for incompatible API changes +/// - **MINOR**: Incremented for backwards-compatible functionality additions +/// - **PATCH**: Incremented for backwards-compatible bug fixes +/// - **PRERELEASE**: Optional identifier for pre-release versions #[derive(Debug, Clone, Eq, PartialEq)] pub struct SemanticVersion { + /// Major version number (incompatible API changes) pub major: u64, + /// Minor version number (backwards-compatible additions) pub minor: u64, + /// Patch version number (backwards-compatible fixes) pub patch: u64, + /// Optional prerelease identifier (e.g., "alpha", "beta", "rc.1") pub prerelease: Option, } impl SemanticVersion { + /// Create a new semantic version. + /// + /// # Arguments + /// + /// * `major` - Major version number + /// * `minor` - Minor version number + /// * `patch` - Patch version number + /// * `prerelease` - Optional prerelease identifier + /// + /// # Examples + /// + /// ```rust + /// use odict::SemanticVersion; + /// + /// // Stable version + /// let stable = SemanticVersion::new(1, 2, 3, None); + /// assert_eq!(stable.to_string(), "1.2.3"); + /// + /// // Prerelease version + /// let prerelease = SemanticVersion::new(1, 2, 3, Some("alpha".to_string())); + /// assert_eq!(prerelease.to_string(), "1.2.3-alpha"); + /// ``` pub fn new(major: u64, minor: u64, patch: u64, prerelease: Option) -> Self { Self { major, @@ -21,10 +104,56 @@ impl SemanticVersion { } } + /// Check if this version is compatible with another version. + /// + /// Two versions are compatible if they have the same major version and + /// the same prerelease status (both stable or both prerelease with the + /// same identifier). 
+ /// + /// # Arguments + /// + /// * `other` - The version to check compatibility against + /// + /// # Returns + /// + /// `true` if the versions are compatible, `false` otherwise. + /// + /// # Examples + /// + /// ```rust + /// use odict::SemanticVersion; + /// + /// let v1_0_0: SemanticVersion = "1.0.0".into(); + /// let v1_2_3: SemanticVersion = "1.2.3".into(); + /// let v2_0_0: SemanticVersion = "2.0.0".into(); + /// let v1_0_0_alpha: SemanticVersion = "1.0.0-alpha".into(); + /// + /// assert!(v1_0_0.is_compatible(&v1_2_3)); // Same major version + /// assert!(!v1_0_0.is_compatible(&v2_0_0)); // Different major version + /// assert!(!v1_0_0.is_compatible(&v1_0_0_alpha)); // Different prerelease status + /// ``` pub fn is_compatible(&self, other: &Self) -> bool { self.major == other.major && self.prerelease.as_deref() == other.prerelease.as_deref() } + /// Convert the version to a byte vector. + /// + /// This method converts the version string representation to UTF-8 bytes, + /// which is useful for serialization and storage in binary formats. + /// + /// # Returns + /// + /// A `Vec` containing the UTF-8 encoded version string. + /// + /// # Examples + /// + /// ```rust + /// use odict::SemanticVersion; + /// + /// let version: SemanticVersion = "1.2.3".into(); + /// let bytes = version.as_bytes(); + /// assert_eq!(bytes, b"1.2.3"); + /// ``` pub fn as_bytes(&self) -> Vec { self.to_string().into_bytes() } diff --git a/lib/src/core/write.rs b/lib/src/core/write.rs index 21d33ea1..066a4069 100644 --- a/lib/src/core/write.rs +++ b/lib/src/core/write.rs @@ -1,3 +1,54 @@ +//! Dictionary writing and persistence operations for ODict. +//! +//! This module provides functionality to save compiled dictionaries to disk +//! in the binary ODict format. It handles file creation, binary serialization, +//! and path management for persistent storage. +//! +//! # Overview +//! +//! The writing functionality allows you to: +//! - Save compiled dictionaries to disk +//! 
- Customize compilation options during save +//! - Automatically update path references +//! - Ensure data integrity through proper file handling +//! +//! # File Format +//! +//! Dictionaries are saved in the binary ODict format, which includes: +//! - Format signature and version information +//! - Compressed serialized dictionary data +//! - Metadata for compatibility checking +//! +//! # Examples +//! +//! ## Basic Dictionary Saving +//! +//! ```rust +//! use odict::{Dictionary, OpenDictionary}; +//! +//! let dict = Dictionary::from_path("source.xml")?; +//! let mut compiled = dict.build()?; +//! +//! // Save with default options +//! compiled.to_disk("output.odict")?; +//! # Ok::<(), Box>(()) +//! ``` +//! +//! ## Saving with Custom Options +//! +//! ```rust +//! use odict::{Dictionary, OpenDictionary, CompilerOptions, CompressOptions}; +//! +//! let dict = Dictionary::from_path("source.xml")?; +//! let mut compiled = dict.build()?; +//! +//! let options = CompilerOptions::default() +//! .with_compression(CompressOptions::default()); +//! +//! compiled.to_disk_with_options("output.odict", options)?; +//! # Ok::<(), Box>(()) +//! ``` + use std::fs::canonicalize; use std::path::Path; use std::{fs::File, io::Write}; @@ -6,10 +57,108 @@ use crate::compile::CompilerOptions; use crate::OpenDictionary; impl OpenDictionary { + /// Save the dictionary to disk using default compilation options. + /// + /// This method writes the dictionary to the specified file path in the + /// binary ODict format. It uses default compression settings and updates + /// the dictionary's internal path reference to the saved location. + /// + /// # Arguments + /// + /// * `path` - The file path where the dictionary should be saved + /// + /// # Returns + /// + /// `Ok(())` if the save operation succeeds. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be created or written to + /// - Compilation/compression fails + /// - File system permissions prevent writing + /// - The path cannot be canonicalized + /// + /// # Side Effects + /// + /// - Creates or overwrites the file at the specified path + /// - Updates the dictionary's internal path reference + /// - Ensures all data is flushed to disk + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, OpenDictionary}; + /// + /// let dict = Dictionary::from_path("source.xml")?; + /// let mut compiled = dict.build()?; + /// + /// compiled.to_disk("my_dictionary.odict")?; + /// + /// // Path is now updated + /// if let Some(path) = &compiled.path { + /// println!("Saved to: {}", path.display()); + /// } + /// # Ok::<(), Box>(()) + /// ``` pub fn to_disk>(&mut self, path: P) -> crate::Result<()> { self.to_disk_with_options(path, CompilerOptions::default()) } + /// Save the dictionary to disk with custom compilation options. + /// + /// This method provides fine-grained control over the save process, + /// allowing customization of compression settings and other compilation + /// options. The dictionary is written in the binary ODict format. + /// + /// # Arguments + /// + /// * `path` - The file path where the dictionary should be saved + /// * `options` - Compilation options to customize the save process + /// + /// # Returns + /// + /// `Ok(())` if the save operation succeeds. 
+ /// + /// # Errors + /// + /// Returns an error if: + /// - The file cannot be created or written to + /// - Compilation fails with the specified options + /// - Compression fails + /// - File system permissions prevent writing + /// - The path cannot be canonicalized + /// + /// # Side Effects + /// + /// - Creates or overwrites the file at the specified path + /// - Updates the dictionary's internal path reference to the canonical path + /// - Ensures all data is properly flushed to disk + /// + /// # Examples + /// + /// ```rust + /// use odict::{Dictionary, OpenDictionary, CompilerOptions, CompressOptions}; + /// + /// let dict = Dictionary::from_path("source.xml")?; + /// let mut compiled = dict.build()?; + /// + /// // Use custom compression settings + /// let options = CompilerOptions::default() + /// .with_compression(CompressOptions::default()); + /// + /// compiled.to_disk_with_options("optimized.odict", options)?; + /// # Ok::<(), Box>(()) + /// ``` + /// + /// # Performance + /// + /// The save operation involves: + /// 1. Compiling the dictionary to binary format with specified options + /// 2. Creating/opening the target file + /// 3. Writing all data to disk + /// 4. Flushing to ensure data persistence + /// 5. Canonicalizing the path for accurate reference pub fn to_disk_with_options, P: AsRef>( &mut self, path: P, @@ -21,9 +170,7 @@ impl OpenDictionary { file.write_all(&buf)?; file.flush()?; - self.path = canonicalize(path)? 
- .to_str() - .map(std::path::PathBuf::from); + self.path = canonicalize(path)?.to_str().map(std::path::PathBuf::from); Ok(()) } diff --git a/node/Cargo.toml b/node/Cargo.toml index 35cf7c86..981c1c47 100644 --- a/node/Cargo.toml +++ b/node/Cargo.toml @@ -5,7 +5,7 @@ version = "1.1.1" publish = false [lib] -crate-type = ["cdylib"] +crate-type = ["cdylib", "rlib"] [features] default = [] diff --git a/python/Cargo.toml b/python/Cargo.toml index 46437e8f..09ed73ff 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" [lib] name = "theopendictionary" -crate-type = ["cdylib"] +crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.27.2", features = ["either"] } diff --git a/python/src/dictionary.rs b/python/src/dictionary.rs index 993a331f..219e6285 100644 --- a/python/src/dictionary.rs +++ b/python/src/dictionary.rs @@ -9,6 +9,16 @@ use crate::{ utils::cast_error, }; +/// Compiles an ODXML string into binary `.odict` data. +/// +/// Takes an XML string conforming to the ODict XML schema and returns +/// the compiled binary representation as a byte vector. The resulting +/// bytes can be passed to [`OpenDictionary::new`] or saved to disk. +/// +/// # Errors +/// +/// Returns an error if the XML is malformed or does not conform to the +/// ODict schema. #[pyfunction] pub fn compile(xml: String) -> PyResult> { let bytes = xml @@ -19,6 +29,15 @@ pub fn compile(xml: String) -> PyResult> { Ok(bytes) } +/// The main class for working with compiled ODict dictionaries. +/// +/// An `OpenDictionary` wraps a compiled binary dictionary and provides +/// methods for looking up terms, full-text search, tokenization, and more. +/// +/// # Construction +/// +/// Create from compiled bytes or an XML string using [`OpenDictionary::new`], +/// or load from a file path or remote registry using [`OpenDictionary::load`]. 
#[pyclass] pub struct OpenDictionary { dict: odict::OpenDictionary, @@ -26,6 +45,11 @@ pub struct OpenDictionary { #[pymethods] impl OpenDictionary { + /// Loads a dictionary from a file path, alias, or remote identifier. + /// + /// This is an async method. If `dictionary` is a path to a `.odict` file, + /// it loads from disk. If it matches the format `org/lang` (e.g. `wiktionary/eng`), + /// it downloads from the remote registry. #[staticmethod] #[pyo3(signature = (dictionary, options=None))] pub fn load<'py>( @@ -50,6 +74,10 @@ impl OpenDictionary { }) } + /// Creates a dictionary from compiled binary data or directly from an XML string. + /// + /// Accepts either `bytes` (as returned by [`compile`]) or a `str` containing + /// ODXML markup. #[new] pub fn new(data: Either, String>) -> PyResult { let bytes = match data { @@ -60,6 +88,10 @@ impl OpenDictionary { Ok(Self { dict }) } + /// Saves the dictionary to disk as a `.odict` file. + /// + /// Optionally configure Brotli compression via `quality` (0–11) and + /// `window_size` (0–22). #[pyo3(signature = (path, quality=None, window_size=None))] pub fn save( &mut self, @@ -89,16 +121,24 @@ impl OpenDictionary { } } + /// The minimum rank value across all entries, or `None` if no entries have ranks. #[getter] pub fn min_rank(&self) -> PyResult> { Ok(self.dict.contents().map_err(cast_error)?.min_rank()) } + /// The maximum rank value across all entries, or `None` if no entries have ranks. #[getter] pub fn max_rank(&self) -> PyResult> { Ok(self.dict.contents().map_err(cast_error)?.max_rank()) } + /// Looks up one or more terms by exact match. + /// + /// - `query` — a single term or list of terms to look up. + /// - `split` — minimum word length for compound splitting. + /// - `follow` — follow `see_also` cross-references until an entry with etymologies is found. + /// - `insensitive` — enable case-insensitive matching. 
#[pyo3(signature = (query, split=None, follow=None, insensitive=None))] pub fn lookup( &self, @@ -135,6 +175,7 @@ impl OpenDictionary { Ok(mapped) } + /// Returns all terms defined in the dictionary, sorted alphabetically. pub fn lexicon(&self) -> PyResult> { let dict = self.dict.contents().map_err(cast_error)?; let lexicon = dict.lexicon(); @@ -142,6 +183,9 @@ impl OpenDictionary { Ok(lexicon) } + /// Creates a full-text search index for the dictionary. + /// + /// Must be called before [`OpenDictionary::search`]. #[pyo3(signature = (options=None))] pub fn index(&self, options: Option) -> PyResult<()> { let dict = self.dict.contents().map_err(cast_error)?; @@ -153,6 +197,9 @@ impl OpenDictionary { Ok(()) } + /// Runs a full-text search across the dictionary. + /// + /// Requires an index — call [`OpenDictionary::index`] first. #[pyo3(signature = (query, options=None))] pub fn search(&self, query: String, options: Option) -> PyResult> { let dict = self.dict.contents().map_err(cast_error)?; @@ -170,6 +217,14 @@ impl OpenDictionary { Ok(entries) } + /// Tokenizes text using NLP-based segmentation and matches each token against the dictionary. + /// + /// Supports Chinese, Japanese, Korean, Thai, Khmer, German, Swedish, + /// and Latin-script languages. + /// + /// - `text` — the text to tokenize. + /// - `follow` — follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). + /// - `insensitive` — enable case-insensitive matching. #[pyo3(signature = (text, follow=None, insensitive=None))] pub fn tokenize( &self, diff --git a/python/src/types/definition.rs b/python/src/types/definition.rs index 5094c218..e3c2495f 100644 --- a/python/src/types/definition.rs +++ b/python/src/types/definition.rs @@ -3,16 +3,23 @@ use structural_convert::StructuralConvert; use super::{note::Note, Example}; +/// A single definition of a word sense. +/// +/// Contains the definition text along with optional examples and notes. 
#[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Definition))] pub struct Definition { + /// Optional identifier for this definition. #[pyo3(get)] pub id: Option, + /// The definition text. #[pyo3(get)] pub value: String, + /// Usage examples illustrating this definition. #[pyo3(get)] pub examples: Vec, + /// Additional notes about this definition. #[pyo3(get)] pub notes: Vec, } diff --git a/python/src/types/entry.rs b/python/src/types/entry.rs index 23b06085..78dbf4c6 100644 --- a/python/src/types/entry.rs +++ b/python/src/types/entry.rs @@ -6,18 +6,27 @@ use crate::utils::cast_error; use super::etymology::Etymology; use super::media_url::MediaURL; +/// A dictionary entry representing a single headword and its associated data. +/// +/// Each entry contains the term itself, optional ranking metadata, +/// cross-reference information, etymologies, and media attachments. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Entry))] pub struct Entry { + /// The headword for this entry. #[pyo3(get)] pub term: String, + /// Optional frequency rank for ordering entries. #[pyo3(get)] pub rank: Option, + /// Cross-reference target term, if this entry redirects to another. #[pyo3(get)] pub see_also: Option, + /// The etymologies associated with this entry. #[pyo3(get)] pub etymologies: Vec, + /// Media URLs (audio, images, etc.) associated with this entry. #[pyo3(get)] pub media: Vec, } diff --git a/python/src/types/enums.rs b/python/src/types/enums.rs index 913cddf3..6b365cf2 100644 --- a/python/src/types/enums.rs +++ b/python/src/types/enums.rs @@ -1,16 +1,23 @@ use pyo3::prelude::*; use structural_convert::StructuralConvert; +/// A wrapper for ODict enumeration values (e.g. part of speech, pronunciation kind). +/// +/// ODict enums are represented as string triples: the enum name, +/// the variant name, and the variant's string value. 
#[pyclass] #[derive(Debug, PartialEq, Clone, StructuralConvert)] #[convert(from(internal::EnumWrapper))] pub struct EnumWrapper { + /// The enum type name (e.g. `"PartOfSpeech"`). #[pyo3(get)] pub name: String, + /// The variant name (e.g. `"Noun"`). #[pyo3(get)] pub variant: String, + /// The string value of the variant (e.g. `"n"`). #[pyo3(get)] pub value: String, } diff --git a/python/src/types/etymology.rs b/python/src/types/etymology.rs index bf06b166..f3a1b540 100644 --- a/python/src/types/etymology.rs +++ b/python/src/types/etymology.rs @@ -5,15 +5,23 @@ use pyo3::prelude::*; use super::pronunciation::Pronunciation; use super::sense::Sense; +/// An etymology grouping for a dictionary entry. +/// +/// Etymologies group together senses that share a common word origin. +/// Each etymology can have its own pronunciations and description. #[pyclass] #[derive(Clone)] pub struct Etymology { + /// Optional identifier for this etymology. #[pyo3(get)] pub id: Option, + /// Pronunciations associated with this etymology. #[pyo3(get)] pub pronunciations: Vec, + /// Optional description of the word origin. #[pyo3(get)] pub description: Option, + /// The senses (meanings) under this etymology. #[pyo3(get)] pub senses: Vec, } diff --git a/python/src/types/example.rs b/python/src/types/example.rs index 10047079..a2615f4f 100644 --- a/python/src/types/example.rs +++ b/python/src/types/example.rs @@ -3,16 +3,22 @@ use crate::types::{Pronunciation, Translation}; use pyo3::prelude::*; use structural_convert::StructuralConvert; +/// A usage example illustrating a definition. +/// +/// Examples can optionally include translations and pronunciations. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Example))] pub struct Example { + /// The example text. #[pyo3(get)] pub value: String, + /// Translations of this example into other languages. #[pyo3(get)] pub translations: Vec, + /// Pronunciations for this example. 
#[pyo3(get)] pub pronunciations: Vec, } diff --git a/python/src/types/form.rs b/python/src/types/form.rs index 198527c8..5e1862ca 100644 --- a/python/src/types/form.rs +++ b/python/src/types/form.rs @@ -3,15 +3,22 @@ use pyo3::prelude::*; use super::enums::EnumWrapper; +/// An inflected or alternate form of a word. +/// +/// Forms represent morphological variants such as plurals, conjugations, +/// or other inflections. #[pyclass] #[derive(Clone, Debug)] pub struct Form { + /// The inflected form text. #[pyo3(get)] pub term: String, + /// The kind of form (e.g. plural, past tense), or `None`. #[pyo3(get, set)] pub kind: Option, + /// Tags for categorizing this form. #[pyo3(get)] pub tags: Vec, } diff --git a/python/src/types/group.rs b/python/src/types/group.rs index 60c9b265..7f9c951b 100644 --- a/python/src/types/group.rs +++ b/python/src/types/group.rs @@ -3,14 +3,21 @@ use structural_convert::StructuralConvert; use super::definition::Definition; +/// A named group of related definitions. +/// +/// Groups allow organizing multiple definitions under a shared description, +/// such as grouping definitions by semantic domain. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Group))] pub struct Group { + /// Optional identifier for this group. #[pyo3(get)] pub id: Option, + /// A description of what this group of definitions has in common. #[pyo3(get)] pub description: String, + /// The definitions within this group. #[pyo3(get)] pub definitions: Vec, } diff --git a/python/src/types/index.rs b/python/src/types/index.rs index 535bdb0b..ab21daef 100644 --- a/python/src/types/index.rs +++ b/python/src/types/index.rs @@ -1,14 +1,18 @@ use pyo3::prelude::*; +/// Options for configuring full-text index creation. #[pyclass] #[derive(Clone)] pub struct IndexOptions { + /// Custom directory for storing the index. #[pyo3(get, set)] pub directory: Option, + /// Memory arena size per thread in bytes (must be >15 MB). 
#[pyo3(get, set)] pub memory: Option, + /// Whether to overwrite an existing index. #[pyo3(get, set)] pub overwrite: Option, } diff --git a/python/src/types/load.rs b/python/src/types/load.rs index 51ee8d98..b2f0014c 100644 --- a/python/src/types/load.rs +++ b/python/src/types/load.rs @@ -1,12 +1,16 @@ use pyo3::prelude::*; +/// Options for loading dictionaries from remote registries. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct RemoteLoadOptions { + /// Custom output directory for downloaded files. #[pyo3(get, set)] pub out_dir: Option, + /// Whether to cache downloaded dictionaries locally. #[pyo3(get, set)] pub caching: Option, + /// Number of download retries on failure. #[pyo3(get, set)] pub retries: Option, } @@ -20,11 +24,14 @@ impl RemoteLoadOptions { } } +/// Options for loading a dictionary from a file path, alias, or remote registry. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct LoadOptions { + /// Custom configuration directory. #[pyo3(get, set)] pub config_dir: Option, + /// Options for remote dictionary loading. #[pyo3(get, set)] pub remote: Option, } diff --git a/python/src/types/lookup.rs b/python/src/types/lookup.rs index 55c5b1ff..4888abca 100644 --- a/python/src/types/lookup.rs +++ b/python/src/types/lookup.rs @@ -2,15 +2,19 @@ use pyo3::prelude::*; use super::Entry; +/// Options for configuring term lookups. #[pyclass] #[derive(Clone)] pub struct LookupOptions { + /// Minimum word length for compound splitting. #[pyo3(get, set)] pub split: Option, + /// Whether to follow `see_also` cross-references. #[pyo3(get, set)] pub follow: Option, + /// Whether to enable case-insensitive matching. #[pyo3(get, set)] pub insensitive: Option, } @@ -58,11 +62,17 @@ impl From for odict::lookup::LookupOptions { } } +/// The result of a dictionary lookup. +/// +/// Contains the matched entry and, if a `see_also` redirect was followed, +/// the original entry that initiated the redirect. 
#[pyclass] #[derive(Debug, Clone)] pub struct LookupResult { + /// The matched dictionary entry. #[pyo3(get)] pub entry: Entry, + /// The original entry if a `see_also` redirect was followed, or `None`. #[pyo3(get)] pub directed_from: Option, } diff --git a/python/src/types/media_url.rs b/python/src/types/media_url.rs index f77fbf86..296e00df 100644 --- a/python/src/types/media_url.rs +++ b/python/src/types/media_url.rs @@ -2,16 +2,20 @@ use pyo3::prelude::*; use std::fmt; use structural_convert::StructuralConvert; +/// A reference to an external media resource (audio, image, etc.). #[pyclass] #[derive(Clone, Debug, StructuralConvert)] #[convert(from(odict::schema::MediaURL))] pub struct MediaURL { + /// URL or path to the media file. #[pyo3(get)] pub src: String, + /// MIME type (e.g. `audio/mpeg`), or `None`. #[pyo3(get)] pub mime_type: Option, + /// Human-readable description of the media. #[pyo3(get)] pub description: Option, } diff --git a/python/src/types/note.rs b/python/src/types/note.rs index 811d7706..82499923 100644 --- a/python/src/types/note.rs +++ b/python/src/types/note.rs @@ -3,14 +3,21 @@ use structural_convert::StructuralConvert; use super::Example; +/// An additional note attached to a definition. +/// +/// Notes provide supplementary information such as usage guidance, +/// historical context, or grammatical remarks. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Note))] pub struct Note { + /// Optional identifier for this note. #[pyo3(get)] pub id: Option, + /// The note text. #[pyo3(get)] pub value: String, + /// Examples associated with this note. #[pyo3(get)] pub examples: Vec, } diff --git a/python/src/types/pronunciation.rs b/python/src/types/pronunciation.rs index 8762971a..f6100f04 100644 --- a/python/src/types/pronunciation.rs +++ b/python/src/types/pronunciation.rs @@ -5,15 +5,22 @@ use super::media_url::MediaURL; use internal::ToEnumWrapper; +/// A pronunciation entry for a word or etymology. 
+/// +/// Represents how a word is pronounced in a given notation system +/// (e.g. IPA, Pinyin), with optional audio media. #[pyclass] #[derive(Clone, Debug)] pub struct Pronunciation { + /// The pronunciation system (e.g. IPA, Pinyin), or `None`. #[pyo3(get)] pub kind: Option, + /// The pronunciation notation string. #[pyo3(get)] pub value: String, + /// Audio media URLs for this pronunciation. #[pyo3(get)] pub media: Vec, } diff --git a/python/src/types/save.rs b/python/src/types/save.rs index 28a67eb3..424f4e1f 100644 --- a/python/src/types/save.rs +++ b/python/src/types/save.rs @@ -1,11 +1,14 @@ use pyo3::prelude::*; +/// Brotli compression options for saving dictionaries. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct CompressOptions { + /// Compression quality level (0–11). #[pyo3(get, set)] pub quality: Option, - + + /// Compression window size (0–22). #[pyo3(get, set)] pub window_size: Option, } @@ -22,9 +25,11 @@ impl CompressOptions { } } +/// Options for saving a dictionary to disk. #[pyclass] #[derive(PartialEq, Default, Clone, Eq)] pub struct SaveOptions { + /// Optional Brotli compression settings. #[pyo3(get, set)] pub compress: Option, } diff --git a/python/src/types/search.rs b/python/src/types/search.rs index 6e24b897..b893f5c6 100644 --- a/python/src/types/search.rs +++ b/python/src/types/search.rs @@ -1,17 +1,22 @@ use pyo3::prelude::*; +/// Options for configuring full-text search. #[pyclass] #[derive(Clone)] pub struct SearchOptions { + /// Custom directory for the search index. #[pyo3(get, set)] pub directory: Option, + /// Relevance score threshold for filtering results. #[pyo3(get, set)] pub threshold: Option, + /// Whether to automatically create an index if one does not exist. #[pyo3(get, set)] pub autoindex: Option, + /// Maximum number of results to return. 
#[pyo3(get, set)] pub limit: Option, } diff --git a/python/src/types/sense.rs b/python/src/types/sense.rs index 29b736dc..d909f42f 100644 --- a/python/src/types/sense.rs +++ b/python/src/types/sense.rs @@ -7,19 +7,30 @@ use super::{ definition::Definition, enums::EnumWrapper, form::Form, group::Group, translation::Translation, }; +/// A word sense — a specific meaning grouped by part of speech. +/// +/// Senses represent distinct meanings of a word under a given etymology. +/// Each sense has a part of speech and contains definitions (or definition groups), +/// along with optional tags, translations, and inflected forms. #[pyclass] #[derive(Debug, Clone)] pub struct Sense { + /// The part of speech for this sense (e.g. noun, verb, adjective). #[pyo3(get)] pub pos: EnumWrapper, + /// Optional lemma reference linking to another entry. #[pyo3(get)] pub lemma: Option, + /// Definitions or definition groups under this sense. #[pyo3(get)] pub definitions: Vec>, + /// Tags for categorizing or filtering this sense. #[pyo3(get)] pub tags: Vec, + /// Translations of this sense into other languages. #[pyo3(get)] pub translations: Vec, + /// Inflected forms of the word under this sense. #[pyo3(get)] pub forms: Vec
, } diff --git a/python/src/types/token.rs b/python/src/types/token.rs index ff0fe16f..9da91ba2 100644 --- a/python/src/types/token.rs +++ b/python/src/types/token.rs @@ -3,21 +3,32 @@ use pyo3::prelude::*; use super::LookupResult; +/// A token produced by NLP-based text segmentation. +/// +/// Each token represents a segment of the input text, with metadata about +/// its position, detected language and script, and any matching dictionary entries. #[pyclass] #[derive(Debug)] pub struct Token { + /// The original token text (lemma form). #[pyo3(get)] pub lemma: String, + /// Detected language code (e.g. `"eng"`), or `None` if unknown. #[pyo3(get)] pub language: Option, + /// Matched dictionary entries for this token. #[pyo3(get)] pub entries: Vec, + /// The token kind (e.g. `"Word"`, `"Punctuation"`). #[pyo3(get)] pub kind: String, + /// Detected script name (e.g. `"Latin"`, `"Han"`). #[pyo3(get)] pub script: String, + /// Start byte offset in the original text. #[pyo3(get)] pub start: usize, + /// End byte offset in the original text. #[pyo3(get)] pub end: usize, } diff --git a/python/src/types/tokenize.rs b/python/src/types/tokenize.rs index fdcc2fe1..43cf11af 100644 --- a/python/src/types/tokenize.rs +++ b/python/src/types/tokenize.rs @@ -1,12 +1,15 @@ use either::Either; use pyo3::prelude::*; +/// Options for configuring text tokenization. #[pyclass] #[derive(Clone)] pub struct TokenizeOptions { + /// Whether to follow `see_also` cross-references. Accepts `True`/`False` or a number (nonzero = follow). #[pyo3(get, set)] pub follow: Option>, + /// Whether to enable case-insensitive matching. 
#[pyo3(get, set)] pub insensitive: Option, } diff --git a/python/src/types/translation.rs b/python/src/types/translation.rs index b0dcede2..37663d49 100644 --- a/python/src/types/translation.rs +++ b/python/src/types/translation.rs @@ -1,13 +1,16 @@ use pyo3::prelude::*; use structural_convert::StructuralConvert; +/// A translation of a word, definition, or example into another language. #[pyclass] #[derive(Debug, Clone, StructuralConvert)] #[convert(from(odict::schema::Translation))] pub struct Translation { + /// The ISO 639-3 language code (e.g. `"fra"`, `"deu"`). #[pyo3(get)] pub lang: String, + /// The translated text. #[pyo3(get)] pub value: String, } diff --git a/scripts/rustdoc-to-md.py b/scripts/rustdoc-to-md.py new file mode 100755 index 00000000..7d29bb6c --- /dev/null +++ b/scripts/rustdoc-to-md.py @@ -0,0 +1,378 @@ +#!/usr/bin/env python3 +""" +Extracts Rust doc comments (///) from source files and generates Markdown documentation. + +This script parses Rust source files for structs, functions, and their fields/methods, +then formats the extracted documentation as Markdown suitable for inclusion in a +documentation site. 
+ +Usage: + python scripts/rustdoc-to-md.py python/src # Generate docs for Python bindings + python scripts/rustdoc-to-md.py node/src # Generate docs for Node bindings + +Alternatively, use cargo rustdoc with JSON output and the `rustdoc-md` tool: + RUSTC_BOOTSTRAP=1 cargo rustdoc -p odict_python -- -Z unstable-options --output-format json + rustdoc-md target/doc/theopendictionary.json -o docs/python-api.md +""" + +import re +import sys +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class DocItem: + """A documented item extracted from Rust source.""" + name: str + kind: str # "struct", "function", "method", "field", "getter" + doc: str + signature: str = "" + fields: list = field(default_factory=list) + methods: list = field(default_factory=list) + + +def extract_doc_comment(lines: list[str], end_idx: int) -> str: + """Extract consecutive /// doc comments ending at or before the given line. + + Skips over attribute lines (#[...]) to find the doc comment block. 
+ """ + # First, skip backwards over attribute lines + i = end_idx + while i >= 0: + stripped = lines[i].strip() + if stripped.startswith("///"): + break + elif stripped.startswith("#[") or stripped == "": + i -= 1 + continue + else: + return "" + i -= 1 + + # Now extract the doc comment lines + doc_lines = [] + while i >= 0 and lines[i].strip().startswith("///"): + comment = lines[i].strip().removeprefix("///") + # Preserve leading space for indented content, but strip the first space + if comment.startswith(" "): + comment = comment[1:] + doc_lines.append(comment) + i -= 1 + doc_lines.reverse() + return "\n".join(doc_lines).strip() + + +def parse_rust_file(filepath: Path) -> list[DocItem]: + """Parse a Rust file and extract documented items.""" + content = filepath.read_text() + lines = content.splitlines() + items = [] + + i = 0 + while i < len(lines): + line = lines[i].strip() + + # Detect #[pyfunction] + if line == "#[pyfunction]": + doc = extract_doc_comment(lines, i - 1) + # Find the function signature + j = i + 1 + while j < len(lines) and not lines[j].strip().startswith("pub fn "): + j += 1 + if j < len(lines): + sig = extract_fn_signature(lines, j) + name = re.search(r"pub fn (\w+)", lines[j]) + if name: + items.append(DocItem( + name=name.group(1), + kind="function", + doc=doc, + signature=sig, + )) + + # Detect #[pyclass] or #[napi(object)] + if line == "#[pyclass]" or line.startswith("#[pyclass") or line.startswith("#[napi"): + # Only match struct-level napi, not method-level + if line.startswith("#[napi") and "object" not in line and "constructor" not in line: + i += 1 + continue + + doc = extract_doc_comment(lines, i - 1) + # Find the struct name + j = i + 1 + while j < len(lines) and not lines[j].strip().startswith("pub struct "): + j += 1 + if j < len(lines): + name_match = re.search(r"pub struct (\w+)", lines[j]) + if name_match: + struct_name = name_match.group(1) + struct_item = DocItem( + name=struct_name, + kind="struct", + doc=doc, + ) + # 
Extract fields from struct body + if lines[j].strip().endswith("{"): + k = j + 1 + while k < len(lines) and not lines[k].strip().startswith("}"): + field_line = lines[k].strip() + if field_line.startswith("pub "): + # Look back for doc comment, skipping #[pyo3(...)] attrs + field_doc = extract_doc_comment(lines, k - 1) + field_match = re.match( + r"pub\s+(\w+):\s*(.+?),?\s*$", field_line + ) + if field_match: + struct_item.fields.append(DocItem( + name=field_match.group(1), + kind="field", + doc=field_doc, + signature=field_match.group(2), + )) + k += 1 + items.append(struct_item) + + # Detect #[pymethods] impl blocks + if line == "#[pymethods]": + j = i + 1 + while j < len(lines) and not lines[j].strip().startswith("impl "): + j += 1 + if j < len(lines): + impl_match = re.search(r"impl (\w+)", lines[j]) + if impl_match: + impl_name = impl_match.group(1) + # Find the matching struct in items + target = None + for item in items: + if item.name == impl_name and item.kind == "struct": + target = item + break + + # Parse methods in the impl block + brace_depth = 0 + k = j + while k < len(lines): + if "{" in lines[k]: + brace_depth += lines[k].count("{") + if "}" in lines[k]: + brace_depth -= lines[k].count("}") + if brace_depth == 0 and k > j: + break + + mline = lines[k].strip() + + # Check for pub fn (but skip dunder methods) + if mline.startswith("pub fn ") and "__" not in mline: + # Extract doc comment, skipping attribute lines + method_doc = extract_doc_comment(lines, k - 1) + + # Check for #[getter] in preceding attribute lines + is_getter = False + is_staticmethod = False + is_new = False + for back in range(max(0, k - 10), k): + attr = lines[back].strip() + if attr == "#[getter]": + is_getter = True + elif attr == "#[staticmethod]": + is_staticmethod = True + elif attr == "#[new]": + is_new = True + elif attr.startswith("pub fn ") or (attr.startswith("///") and back < k - 1): + break # stop looking back past another function or doc + + sig = 
extract_fn_signature(lines, k) + + name_match = re.search(r"pub fn (\w+)", mline) + if name_match and method_doc: + kind = "getter" if is_getter else ("static" if is_staticmethod else "method") + if is_new: + kind = "constructor" + + method = DocItem( + name=name_match.group(1), + kind=kind, + doc=method_doc, + signature=sig, + ) + if target: + target.methods.append(method) + k += 1 + + i += 1 + + return items + + +def extract_fn_signature(lines: list[str], start: int) -> str: + """Extract a function signature starting from the given line.""" + sig_lines = [] + paren_depth = 0 + i = start + while i < len(lines): + line = lines[i] + sig_lines.append(line.rstrip()) + paren_depth += line.count("(") - line.count(")") + if paren_depth <= 0 and ")" in line: + # Check for return type on next line + if i + 1 < len(lines) and lines[i + 1].strip().startswith("->"): + sig_lines.append(lines[i + 1].rstrip()) + break + i += 1 + + sig = " ".join(l.strip() for l in sig_lines) + # Clean up: extract just the function part + match = re.search(r"(pub fn \w+.*?)(?:\s*\{|\s*where)", sig) + if match: + return match.group(1).strip() + match = re.search(r"(pub fn \w+[^{]*)", sig) + if match: + return match.group(1).strip().rstrip("{").strip() + return sig + + +def rust_type_to_display(ty: str) -> str: + """Convert a Rust type to a more readable display format.""" + ty = ty.strip().rstrip(",") + # Option -> T | None + m = re.match(r"Option<(.+)>$", ty) + if m: + inner = rust_type_to_display(m.group(1)) + return f"{inner} | None" + # Vec -> list[T] + m = re.match(r"Vec<(.+)>$", ty) + if m: + inner = rust_type_to_display(m.group(1)) + return f"list[{inner}]" + # Either -> A | B + m = re.match(r"Either<(.+),\s*(.+)>$", ty) + if m: + a = rust_type_to_display(m.group(1)) + b = rust_type_to_display(m.group(2)) + return f"{a} | {b}" + # Basic type mappings + mappings = { + "String": "str", + "&str": "str", + "u32": "int", + "u64": "int", + "i32": "int", + "i64": "int", + "usize": "int", + "bool": "bool", + "f32": "float", + "f64": 
"float", + } + return mappings.get(ty, ty) + + +def items_to_markdown(items: list[DocItem], title: str) -> str: + """Convert extracted items to Markdown format.""" + md = [] + md.append(f"# {title}\n") + md.append("*Auto-generated from Rust doc comments.*\n") + md.append("---\n") + + # Separate functions and structs + functions = [i for i in items if i.kind == "function"] + structs = [i for i in items if i.kind == "struct"] + + if functions: + md.append("## Functions\n") + for func in functions: + md.append(f"### `{func.name}()`\n") + if func.doc: + md.append(f"{func.doc}\n") + + for struct in structs: + md.append(f"## `{struct.name}`\n") + if struct.doc: + md.append(f"{struct.doc}\n") + + # Constructors + constructors = [m for m in struct.methods if m.kind == "constructor"] + if constructors: + md.append("### Constructors\n") + for c in constructors: + md.append(f"#### `{struct.name}()`\n") + if c.doc: + md.append(f"{c.doc}\n") + + # Static methods + statics = [m for m in struct.methods if m.kind == "static"] + for s in statics: + md.append(f"### `{struct.name}.{s.name}()`\n") + if s.doc: + md.append(f"{s.doc}\n") + + # Properties (fields + getters) + getters = [m for m in struct.methods if m.kind == "getter"] + if struct.fields or getters: + md.append("### Properties\n") + md.append("| Property | Type | Description |") + md.append("|----------|------|-------------|") + for f in struct.fields: + ty = rust_type_to_display(f.signature) + doc = f.doc.replace("\n", " ") if f.doc else "" + md.append(f"| `{f.name}` | `{ty}` | {doc} |") + for g in getters: + doc = g.doc.replace("\n", " ") if g.doc else "" + md.append(f"| `{g.name}` | — | {doc} |") + md.append("") + + # Methods (non-getter, non-constructor) + methods = [m for m in struct.methods if m.kind == "method"] + if methods: + md.append("### Methods\n") + for m in methods: + md.append(f"#### `{m.name}()`\n") + if m.doc: + md.append(f"{m.doc}\n") + + md.append("---\n") + + return "\n".join(md) + + +def main(): + if 
len(sys.argv) < 2: + print(f"Usage: {sys.argv[0]} <src_dir> [output_file]") + print(f"Example: {sys.argv[0]} python/src docs/python-api-generated.md") + sys.exit(1) + + src_dir = Path(sys.argv[1]) + output_file = sys.argv[2] if len(sys.argv) > 2 else None + + if not src_dir.exists(): + print(f"Error: directory {src_dir} does not exist") + sys.exit(1) + + # Collect all .rs files + rs_files = sorted(src_dir.rglob("*.rs")) + all_items = [] + + for rs_file in rs_files: + items = parse_rust_file(rs_file) + all_items.extend(items) + + if not all_items: + print("No documented items found.") + sys.exit(1) + + # Determine title from directory name + dir_name = src_dir.parent.name if src_dir.name == "src" else src_dir.name + title_map = {"python": "Python API", "node": "JavaScript API"} + title = title_map.get(dir_name, f"{dir_name} API") + + md = items_to_markdown(all_items, title) + + if output_file: + Path(output_file).write_text(md) + print(f"Generated {output_file} ({len(all_items)} items)") + else: + print(md) + + +if __name__ == "__main__": + main()