diff --git a/.github/pages/index.html b/.github/pages/index.html index 14dc65a..2a7e301 100644 --- a/.github/pages/index.html +++ b/.github/pages/index.html @@ -102,6 +102,13 @@

Work in Progress

href="https://github.com/clarin-eric/fcs-misc/tree/main/fcs-aai" rel="noopener" target="_blank">Sources +
  • + + + LexFCS 1.0: HTML, PDF, Sources +
  • diff --git a/.github/workflows/build-lexfcs-adoc.yml b/.github/workflows/build-lexfcs-adoc.yml new file mode 100644 index 0000000..bca8ece --- /dev/null +++ b/.github/workflows/build-lexfcs-adoc.yml @@ -0,0 +1,37 @@ +name: build adocs + +on: + push: + branches: + - main + paths: + - 'lexfcs/**' + - '.github/workflows/build-lexfcs-adoc.yml' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + container: asciidoctor/docker-asciidoctor + + steps: + - uses: actions/checkout@v4 + + - name: Build HTML + run: asciidoctor -v -D docs -a data-uri --backend=html5 -o lexfcs.html lexfcs/index.adoc + + - name: Build PDF + run: asciidoctor-pdf -v -D docs -o lexfcs.pdf lexfcs/index.adoc + + - name: Copy attachments + run: cp -R lexfcs/attachments docs/ + + - name: Store results + uses: actions/upload-artifact@v4 + with: + name: lexfcs-specs + path: docs/* diff --git a/.github/workflows/publish-gh-pages.yml b/.github/workflows/publish-gh-pages.yml index c00c907..753a3d5 100644 --- a/.github/workflows/publish-gh-pages.yml +++ b/.github/workflows/publish-gh-pages.yml @@ -91,6 +91,28 @@ jobs: name: fcs-aai-specs path: docs/* + lexfcs: + runs-on: ubuntu-latest + container: asciidoctor/docker-asciidoctor + + steps: + - uses: actions/checkout@v4 + + - name: Build HTML + run: asciidoctor -v -D docs -a data-uri --backend=html5 -o lexfcs.html lexfcs/index.adoc + + - name: Build PDF + run: asciidoctor-pdf -v -D docs -o lexfcs.pdf lexfcs/index.adoc + + - name: Copy attachments + run: cp -R lexfcs/attachments docs/ + + - name: Store results + uses: actions/upload-artifact@v4 + with: + name: lexfcs-specs + path: docs/* + fcs-endpoint-dev-slides: runs-on: ubuntu-latest container: asciidoctor/docker-asciidoctor @@ -149,10 +171,11 @@ jobs: runs-on: ubuntu-latest needs: [ - fcs-aai, - fcs-core-1-0, fcs-core-2-0, + fcs-core-1-0, fcs-dataviews-1-0, + 
fcs-aai, + lexfcs, fcs-endpoint-dev-slides, fcs-endpoint-dev-slides-as-doc, ] diff --git a/README.md b/README.md index 9ac5625..38d1508 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Jump to: [[Specification Documents](#specification-documents)] - [CLARIN Federated Content Search - FCS **Core 1.0**: `fcs-core-1.0/index.adoc`](fcs-core-1.0/index.adoc) - [CLARIN Federated Content Search - FCS **Data Views 1.0**: `fcs-dataviews-1.0/index.adoc`](fcs-dataviews-1.0/index.adoc) - _WIP_ [CLARIN Federated Content Search - FCS **AAI 1.0**: `fcs-aai/index.adoc`](fcs-aai/index.adoc) +- _WIP_ [Text+ **LexFCS 1.0**: `lexfcs/index.adoc`](lexfcs/index.adoc) ### Folder Structure diff --git a/lexfcs/attachments/DataView-LexHits.xsd b/lexfcs/attachments/DataView-LexHits.xsd new file mode 100644 index 0000000..a09b705 --- /dev/null +++ b/lexfcs/attachments/DataView-LexHits.xsd @@ -0,0 +1,85 @@ + + + + + + + This schema defines the structure of a + generic result data view. All CLARIN-FCS endpoints + MUST support this data view. + + + The value application/x-clarin-fcs-hits+xml + MUST be used to indicate a generic result data view. + + + + + + + + + A single result line with one or more marked hits. + White-space is considered non-significant, + except for delimiting tokens. + + + CLARIN-FCS client MAY + normalize white-space and strip leading and trailing + white-space and collapse all white-space between + tokens to a single #x20 character. + + + + + + + + + + A hit highlight or a field type annotation. It SHALL NOT be empty. + + + One Result element MUST contain at least + one Hit element, but MAY + contain more than one. + + + + + + + + + + + + + + + Field type identifier for this annotation. Is used in the <Hit> element to determine which function the annotated text has. 
+ + + + + + + + + + + + + + + + diff --git a/lexfcs/attachments/lexhits-example.xml b/lexfcs/attachments/lexhits-example.xml new file mode 100644 index 0000000..52e1dda --- /dev/null +++ b/lexfcs/attachments/lexhits-example.xml @@ -0,0 +1,6 @@ + + + The quick brown fox jumps over the lazy dog. + diff --git a/lexfcs/dataviews.adoc b/lexfcs/dataviews.adoc new file mode 100644 index 0000000..6c59678 --- /dev/null +++ b/lexfcs/dataviews.adoc @@ -0,0 +1,40 @@ += LexFCS Data Views +:description: FCS DataViews for Lex Search. + + +Data formats for result representation. + + +== Extension of the Hits Data View for LexFCS + +Based on: + +* <> (section "Basic Search", §2.2.3.2) +* Hits XML schema https://github.com/clarin-eric/fcs-misc/blob/main/schema/Core_2/DataView-Hits.xsd["DataView-Hits.xsd"] + +.Example of basic *Hits* Data View +[source,xml] +---- + + + The quick brown fox jumps over the lazydog. + +---- + +Reuse of the `` element, with the extension of content hinting by using an optional attribute `@kind` with the following allowed values: + +* `lex-lemma`: Lemma, +* `lex-pos`: Part of speech, +* `lex-def`: Definition. + +Textual content outside of `` are displayed unchanged. + +.Example of extended *Hits* Data View with additional `@kind` attributes +[source,xml] +---- + + Apple: NOUN. An apple is an edible fruit produced by an apple tree. + +---- + +Endpoints `MUST` generate responses that are valid according to the XML schema link:attachments/DataView-LexHits.xsd["DataView-LexHits.xsd"]. 
diff --git a/lexfcs/index.adoc b/lexfcs/index.adoc new file mode 100644 index 0000000..31a201b --- /dev/null +++ b/lexfcs/index.adoc @@ -0,0 +1,44 @@ += Federated Content Search for Lexical Resources (LexFCS): Specification +Erik Körner ; Thomas Eckart ; Axel Herold ; Frank Wiegand ; Frank Michaelis ; Matthias Bremm ; Louis Cotgrove ; Thorsten Trippel ; Felix Rau +v0.1, 2023-05-04 +// more metadata +:description: Specification extension of the CLARIN Federated Content Search (FCS) for Lexical Resources (LexFCS). +:organization: Text+ +// settings +:doctype: book +// source code +:source-highlighter: rouge +:rouge-style: igor_pro +// toc and heading +:toc: +:toclevels: 4 +:sectnums: +:sectnumlevels: 4 +:appendix-caption!: +// directory stuff +:imagesdir: images +// pdf +ifdef::backend-pdf[] +:pdf-theme: textplus +:pdf-themesdir: {docdir}/themes +:title-logo-image: image:{docdir}/themes/textplus-logo.svg[pdfwidth=3.25in,align=center] +endif::[] + +//ifdef::backend-pdf[] +//[%notitle] +//-- +//[abstract] +//{description} +//-- +//endif::[] + +include::introduction.adoc[leveloffset=+1] + +include::lexcql.adoc[leveloffset=+1] + +include::dataviews.adoc[leveloffset=+1] + +[appendix] +== Normative Appendix + +include::lexcql-contextset.adoc[leveloffset=+2] diff --git a/lexfcs/introduction.adoc b/lexfcs/introduction.adoc new file mode 100644 index 0000000..1a94c14 --- /dev/null +++ b/lexfcs/introduction.adoc @@ -0,0 +1,148 @@ += Introduction +:description: Introduction of LexFCS. + +The _Lexical Search for Federated Content Search (LexFCS)_ specification is an extension of the <> specification that allows search and retrieval of _lexical resources_ including dictionaries, encyclopedias, normative data, terminological databases, ontologies etc. + + +== Terminology + +The key words `MUST`, `MUST NOT`, `REQUIRED`, `SHALL`, `SHALL NOT`, `SHOULD`, `SHOULD NOT`, `RECOMMENDED`, `MAY`, and `OPTIONAL` in this document are to be interpreted as in <>. 
+ + +== Glossary + +NOTE: Based on _Glossary_ in <> specification. + +Aggregator:: + A module or service to dispatch queries to repositories and collect results. + +CLARIN-FCS, FCS:: + CLARIN federated content search, an interface specification to allow searching within resource content of repositories. + +Client:: + A software component, which implements the interface specification to query Endpoints, i.e. an aggregator or a user-interface. + +CQL:: + Contextual Query Language, previously known as Common Query Language, is a domain specific language for representing queries to information retrieval systems such as search engines, bibliographic catalogs and museum collection information. + +Data View:: + A Data View is a mechanism to support different representations of search results, e.g. a "hits with highlights" view, an image or a geolocation. + +Endpoint:: + A software component, which implements the CLARIN-FCS interface specification and translates between CLARIN-FCS and a search engine. + +Hit:: + A piece of data returned by a Search Engine that matches the search criterion. What is considered a Hit highly depends on Search Engine. + +Interface Specification:: + Common harmonized interface and suite of protocols that repositories need to implement. + +PID:: + A Persistent identifier is a long-lasting reference to a digital object. + +Repository Registry:: + A separate service that allows registering Repositories and their Endpoints and provides information about these to other components, e.g. an Aggregator. The https://centres.clarin.eu/[CLARIN Center Registry] is an implementation of such a repository registry. + +Resource:: + A searchable and addressable entity at an Endpoint, such as a text corpus or a multi-modal corpus. + +Resource Fragment:: + A smaller unit in a Resource, e.g. a sentence in a text corpus or a time interval in an audio transcription. 
+ +Result Set:: + An (ordered) set of hits that match a search criterion produced by a search engine as the result of processing a query. + +Search Engine:: + A software component within a repository that allows for searching within the repository contents. + +SRU:: + <> is a protocol for Internet search queries. Originally introduced by Library of Congress <>, later standardization process moved to OASIS <>, <>. + + +== Normative References + +NOTE: Based on _Normative References_ in <> specification. + +[[ref:RFC2119,RFC2119]]RFC2119:: + Key words for use in RFCs to Indicate Requirement Levels, IETF RFC 2119, March 1997, + https://www.ietf.org/rfc/rfc2119.html + +[[ref:XML-Namespaces]]XML-Namespaces:: + Namespaces in XML 1.0 (Third Edition), W3C, 8 December 2009, + http://www.w3.org/TR/2009/REC-xml-names-20091208/ + +[[ref:OASIS-SRU-Overview]]OASIS-SRU-Overview:: + searchRetrieve: Part 0. Overview Version 1.0, OASIS, January 2013, + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part0-overview/searchRetrieve-v1.0-os-part0-overview.html + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part0-overview/searchRetrieve-v1.0-os-part0-overview.doc[(DOC)], + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part0-overview/searchRetrieve-v1.0-os-part0-overview.pdf[(PDF)] + +[[ref:OASIS-SRU12]]OASIS-SRU12:: + searchRetrieve: Part 2. SRU searchRetrieve Operation: APD Binding for SRU 1.2 Version 1.0, OASIS, January 2013, + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part2-sru1.2/searchRetrieve-v1.0-os-part2-sru1.2.html + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part2-sru1.2/searchRetrieve-v1.0-os-part2-sru1.2.doc[(DOC)], + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part2-sru1.2/searchRetrieve-v1.0-os-part2-sru1.2.pdf[(PDF)] + +[[ref:OASIS-SRU20]]OASIS-SRU20:: + searchRetrieve: Part 3. 
SRU searchRetrieve Operation: APD Binding for SRU 2.0 Version 1.0, OASIS, January 2013, + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part3-sru2.0/searchRetrieve-v1.0-os-part3-sru2.0.html + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part3-sru2.0/searchRetrieve-v1.0-os-part3-sru2.0.doc[(DOC)], + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part3-sru2.0/searchRetrieve-v1.0-os-part3-sru2.0.pdf[(PDF)] + +[[ref:OASIS-CQL]]OASIS-CQL:: + searchRetrieve: Part 5. CQL: The Contextual Query Language version 1.0, OASIS, January 2013, + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part5-cql/searchRetrieve-v1.0-os-part5-cql.html + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part5-cql/searchRetrieve-v1.0-os-part5-cql.doc[(DOC)], + http://docs.oasis-open.org/search-ws/searchRetrieve/v1.0/os/part5-cql/searchRetrieve-v1.0-os-part5-cql.pdf[(PDF)] + +[[ref:LOC-SRU12]]LOC-SRU12:: + SRU Version 1.2: SRU Search/Retrieve Operation, Library of Congress, + http://www.loc.gov/standards/sru/sru-1-2.html + +[[ref:LOC-CQL]]LOC-CQL:: + The _Contextual Query Language_, Library of Congress, + https://www.loc.gov/standards/sru/cql/, + see also <> + +[[ref:LOC-CQLCS]]LOC-CQLCS:: + The _CQL Context Set_, Library of Congress, + https://www.loc.gov/standards/sru/cql/contextSets/theCqlContextSet.html + +//CLARIN-FCS-DataViews:: +// CLARIN Federated Content Search (CLARIN-FCS) - Data Views, SCCTC FCS Task-Force, April 2014, +// https://trac.clarin.eu/wiki/FCS/Dataviews, +// https://www.clarin.eu/sites/default/files/CE-2014-0317-CLARIN_FCS_Specification_DataViews_1_0.pdf + +[[ref:CLARIN-FCSCore20]]CLARIN-FCSCore20:: + CLARIN Federated Content Search (CLARIN-FCS) - Core 2.0 specification, SCCTC FCS Task-Force, + https://office.clarin.eu/v/CE-2017-1046-FCS-Specification-v89.pdf + + +== Non-Normative References + +[[ref:UD-POS]]UD-POS:: + Universal Dependencies, Universal POS tags v2.0, + https://universaldependencies.org/u/pos/ + 
+ +== Typographic and XML Namespace conventions + +Sections that are still in discussion and not yet finalized are marked with `(WIP)` and may optionally have some _NOTE_ admonition blocks. Details and specifications `MUST NOT` be considered stable. + +The following typographic conventions for XML fragments will be used throughout this specification: + +* `` ++ +An XML element with the Generic Identifier _Element_ that is bound to an XML namespace denoted by the prefix _prefix_. + +* `@attr` ++ +An XML attribute with the name _attr_. + +* `string` ++ +The literal _string_ must be used either as element content or attribute value. + +Endpoints and Clients `MUST` adhere to the <> specification. The CLARIN-FCS interface specification generally does not dictate whether XML elements should be serialized in their prefixed or non-prefixed syntax, but Endpoints `MUST` ensure that the correct XML namespace is used for elements and that XML namespaces are declared correctly. Clients `MUST` be agnostic regarding syntax for serializing the XML elements, i.e. if the prefixed or un-prefixed variant was used, and `SHOULD` operate solely on _expanded names_, i.e. pairs of _namespace name_ and _local name_. + +For a list of common XML namespace names and prefixes see the table "XML Namespaces and prefixes" in section 1.5 of the <>. diff --git a/lexfcs/lexcql-contextset.adoc b/lexfcs/lexcql-contextset.adoc new file mode 100644 index 0000000..014ad35 --- /dev/null +++ b/lexfcs/lexcql-contextset.adoc @@ -0,0 +1,106 @@ += CQL ContextSet specification +:description: CQL ContextSet specification for LexCQL query language. + + +Used identifier: `\http://text-plus.org/cql/lexres/1.0/` (__draft__) + +Recommended prefix: `lexres` + +TIP: For more examples of CQL ContextSets, see the https://www.loc.gov/standards/sru/cql/contextSets/listOfContextSets.html[list of Context Sets] at the Library of Congress (LoC). + + +== Indexes + +TIP: For more information about CQL indexes, see <>. 
+ + +[%header,width="100%",cols="1,2,2"] +|=== +| Index title +| Description +| Details/Open questions + +| `lemma` +| Lemma name +| Support of multiword expressions? + +| `pos` +| Part of Speech +| Use of <<ref:UD-POS>>. Allow custom tag sets? + +| `def` +| Definition +| + +| `xr$synonymy`, ... +| Semantic relations, analogous to https://dariah-eric.github.io/lexicalresources/pages/TEILex0/TEILex0.html#crossref_typology[types in TEI Lex0] +| Only _Synonym_, _Hyponym_, _Hyperonym_, _Meronym_, _Antonym_, or additional relations? + +| `senseRef` +| Sense, Entity, ... - URI/ID pointing to unique identifier, e.g. for disambiguation | Any URI allowed? Defined subsets (using prefixes)? Subdivision by type (e.g. synset/sense or entity, ...)? +|=== + + +== Relations + +TIP: More information about CQL relations can be found <>. + + +=== Implicit Relations + +* *`=`* ++ +Different functions based on index. +Suggested use: ++ +-- + * _Full match_ for `lemma` and `def`, + * for other indexes (like `pos`) use as _"contained in"_ (_Endpoint-dependent behaviour_). +-- + + +=== Defined Relations + +(potential definition of full/partial match in the future) + + +== Booleans + +TIP: For more information about Booleans in CQL, see <>. + +* *`AND`* +* *`OR`* +* *`NOT`* (__might be unnecessary__) +* *`PROX`* (__might be obsolete__) + + +== Relation Modifiers / Relation Qualifiers + +TIP: For more information about relation modifiers/qualifiers, see <>. + +*none* + +* *`=`* with `/contains`, `/startswith`, `/endswith`, `/fullmatch`, `/partialmatch` modifiers + + +== Boolean Modifiers + +TIP: More information about Boolean modifiers can be found <>. + +*none* + + +== Examples + +. `cat`, `"cat"`, `"United Nations"` + +Searching in the default index; `cql.word` or `lemma` + +. `pos = ADJ` + +Search for adjectives + +. `def = "cat"` + +Search for records whose definition contains the term "cat". + +TODO: Stemming, Lower/Uppercase, Subword-Matches + +. 
`pos = NOUN NOT lion AND def = carnivore` + +Search for nouns with "carnivore" in definition; exclusion of records with "lion" diff --git a/lexfcs/lexcql.adoc b/lexfcs/lexcql.adoc new file mode 100644 index 0000000..93da6f0 --- /dev/null +++ b/lexfcs/lexcql.adoc @@ -0,0 +1,91 @@ += LexCQL +:description: LexCQL query language. + +The LexCQL makes use of the <> for queries on lexical resources in the context of the FCS. This has the benefit of using an existing, well-known and standardized query language with an established ecosystem, including libraries, parsers and extensive documentation. The proposed Context Set can be found in section <>. + +LexCQL queries should offer the greatest possible compatibility and flexibility to users, i.e. they `SHOULD` allow the retrieval of lexical records which are available in different spelling or normalisation variants (upper/lower case; diacritics/umlauts; other forms of normalization) with a simple query. This should make it easier to formulate meaningful queries, reduce frustration caused by missing or incomplete results and also enable fuzzy search functionality. Endpoints `SHOULD` support this flexible, user-oriented handling, but are always free to rank more suitable results higher. + +However, users `SHOULD` also be given the option of _"sharpening"_ search queries using optional operators or modifiers, to refine queries and the associated result sets. + + +== Queryable fields / indexes +Every LexCQL Endpoint `MUST` support queries for field `lemma` (at least implicitly as default field) and `SHOULD` support as many queryable fields as feasible. 
+ +:fn-lemma: pass:n[footnote:[`SHOULD` be used as default query field]] +:fn-pos: footnote:[Potential use of the <<ref:UD-POS>>] + +=== Form and sense definition + +* `lemma`: Lemma, main form, article name {fn-lemma} + +* `def`: Definition + +=== Grammatical properties _(WIP)_ + +NOTE: Analogous to https://dariah-eric.github.io/lexicalresources/pages/TEILex0/TEILex0.html#typology-of-gram[TEI Lex-0 types]. + +* `pos`: Part of speech {fn-pos} + +NOTE: __Open question__: all TEI Lex-0 vocabulary (like case, gender, etc.)? + + +=== Semantic relations / Cross-references _(WIP)_ + +NOTE: Analogous to https://dariah-eric.github.io/lexicalresources/pages/TEILex0/TEILex0.html#crossref_typology[TEI Lex-0 types] with `xr` Context Set. + +* `xr$synonymy`: Synonym +* `xr$hyponymy`: Hyponym +* `xr$hypernymy`: Hyperonym +* `xr$meronymy`: Meronym +* `xr$antonymy`: Antonym + +NOTE: __Open question__: what exactly do search scenarios look like here? + +[discrete] +==== Examples + +* `pos = "NOUN" AND xr$synonymy = "house"` ++ +Searching for nouns that are synonyms of "house". + + +=== Senses, Entities, Synsets _(WIP)_ + +* `senseRef`: References to external sense definitions (like Princeton WordNet, GermaNet, Interlingual Index, authority files) + + +[discrete] +==== Examples + +* `senseRef = "https://d-nb.info/gnd/118571249"` ++ +Searching for all references of the entity https://d-nb.info/gnd/118571249 (person "Gottfried Wilhelm Leibniz"). + +* `senseRef = "wordnet:study%2:31:02::"` ++ +Searching for sense "study" (http://wordnetweb.princeton.edu/perl/webwn?c=6&sub=Change&o2=&o0=1&o8=1&o1=1&o7=&o5=&o9=&o6=1&o3=&o4=&i=12&h=010000000000100000000&s=study[`study%2:31:02::`, _analyze, analyse, study, examine, canvass_]). ++ +NOTE: Some inventory of authorities or lookups should be defined and agreed on. Otherwise, it is up to the Endpoint to interpret the senses correctly. + +_Open questions_: + + *** Standard way of referencing Princeton WordNet senses? 
+ + +=== Operators _(WIP)_ + +* Partial or full match of an _index_ + +** *`=`* for full match? ++ +NOTE: Very generic definition in <> specification. ++ +_Open questions_: + +*** Taking into account lower/uppercase normalisation of Umlauts? +*** CQL operators for partial matches or usage of `/approx`? +*** Regular expressions? + +* Combination with `AND` / `OR` and parentheses `(` ... `)` ++ +NOTE: Leave full support to the endpoints! diff --git a/lexfcs/themes b/lexfcs/themes new file mode 120000 index 0000000..de90031 --- /dev/null +++ b/lexfcs/themes @@ -0,0 +1 @@ +../themes \ No newline at end of file diff --git a/themes/textplus-logo.svg b/themes/textplus-logo.svg new file mode 100644 index 0000000..f750ccd --- /dev/null +++ b/themes/textplus-logo.svg @@ -0,0 +1,16 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/themes/textplus-theme.yml b/themes/textplus-theme.yml new file mode 100644 index 0000000..ab9fe72 --- /dev/null +++ b/themes/textplus-theme.yml @@ -0,0 +1,24 @@ +extends: default +header: + font-color: #999999 + height: 0.5in + recto: + columns: '<30% =0% >70%' + right: + content: '_{doctitle}_' + left: + content: image:textplus-logo.svg[pdfwidth=0.45in] + verso: + columns: '<70% =0% >30%' + left: + content: $header_recto_right_content + right: + content: $header_recto_left_content +footer: + height: 0.45in + recto: + right: + content: '{section-or-chapter-title} | {page-number}' + verso: + left: + content: '{page-number} | {chapter-title}'