diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py
index b68e8d10e..4ae106da5 100644
--- a/nominatim/api/search/icu_tokenizer.py
+++ b/nominatim/api/search/icu_tokenizer.py
@@ -7,7 +7,7 @@
 """
 Implementation of query analysis for the ICU tokenizer.
 """
-from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
+from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast, Set
 from copy import copy
 from collections import defaultdict
 import dataclasses
@@ -22,7 +22,7 @@
 from nominatim.api.logging import log
 from nominatim.api.search import query as qmod
 from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
-
+from nominatim.api.search import icu_tokenizer_japanese
 
 DB_TO_TOKEN_TYPE = {
     'W': qmod.TokenType.WORD,
@@ -161,10 +161,14 @@ async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
             tokenized query.
         """
         log().section('Analyze query (using ICU tokenizer)')
-        normalized = list(filter(lambda p: p.text,
-                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
-                                  for p in phrases)))
-        query = qmod.QueryStruct(normalized)
+        preprocess_query_functions = [
+            self.normalize_phrases,
+            icu_tokenizer_japanese.split_key_japanese_phrases
+        ]
+        for func in preprocess_query_functions:
+            phrases = func(phrases)
+
+        query = qmod.QueryStruct(phrases)
         log().var_dump('Normalized query', query.source)
         if not query.source:
             return query
@@ -203,6 +207,25 @@ def normalize_text(self, text: str) -> str:
         """
         return cast(str, self.normalizer.transliterate(text))
 
+    def normalize_phrases(
+            self, phrases: List[qmod.Phrase]
+    ) -> List[qmod.Phrase]:
+        """ Normalize the phrase texts and drop phrases that become empty.
+        """
+        normalized = list(filter(lambda p: p.text,
+                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
+                                  for p in phrases)))
+        return normalized
+
+    def split_key_japanese_phrases(
+            self, phrases: List[qmod.Phrase]
+    ) -> List[qmod.Phrase]:
+        """ Split Japanese addresses into prefecture, municipality and rest.
+        """
+        split_address = list(filter(lambda p: p.text,
+                             (qmod.Phrase(p.ptype, icu_tokenizer_japanese.transliterate(p.text))
+                              for p in phrases)))
+        return split_address
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
         """ Transliterate the phrases and split them into tokens.
@@ -224,7 +247,10 @@ def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
                         if term:
                             parts.append(QueryPart(term, word, wordnr))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    if word[-1] == ',':
+                        query.nodes[-1].btype = qmod.BreakType.SOFT_PHRASE
+                    else:
+                        query.nodes[-1].btype = qmod.BreakType.WORD
                 wordnr += 1
             query.nodes[-1].btype = qmod.BreakType.PHRASE
 
@@ -254,11 +280,34 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                 ICUToken(0.5, 0, 1, part.token, True, part.token, None))
 
+    def collect_soft_phrase_indexes(self, query: qmod.QueryStruct) -> Set[int]:
+        """ Collect the indexes of all nodes with a SOFT_PHRASE break.
+        """
+        soft_phrase_idx_set = set()
+        for i, node in enumerate(query.nodes):
+            if node.btype == qmod.BreakType.SOFT_PHRASE:
+                soft_phrase_idx_set.add(i)
+        return soft_phrase_idx_set
+
+    def add_soft_phrase_penalties(
+            self,
+            i: int,
+            tlist: qmod.TokenList,
+            soft_phrase_idx_set: Set[int]
+    ) -> None:
+        """ Penalize tokens that span a SOFT_PHRASE break.
+        """
+        for key in soft_phrase_idx_set:
+            if i < key < tlist.end:
+                tlist.add_penalty(0.5)
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
         """ Add penalties to tokens that depend on presence of other token.
         """
+        soft_phrase_idx_set = self.collect_soft_phrase_indexes(query)
+
         for i, node, tlist in query.iter_token_lists():
+            self.add_soft_phrase_penalties(i, tlist, soft_phrase_idx_set)
             if tlist.ttype == qmod.TokenType.POSTCODE:
                 for repl in node.starting:
                     if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \
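Note on the analyze_query() change above: the single normalization call is replaced by a small pipeline of phrase-preprocessing functions that are applied in order, so further steps can be added without touching analyze_query() itself. The following standalone sketch only illustrates that chaining pattern; the simplified Phrase class and the two toy steps are stand-ins, not the real Nominatim types.

# Sketch of the preprocessing-pipeline idea (simplified stand-ins, not Nominatim code).
from dataclasses import dataclass
from typing import Callable, List


@dataclass
class Phrase:
    ptype: str
    text: str


def normalize_phrases(phrases: List[Phrase]) -> List[Phrase]:
    # Toy stand-in for ICUQueryAnalyzer.normalize_phrases(): lower-case, drop empties.
    return [Phrase(p.ptype, p.text.lower()) for p in phrases if p.text]


def split_key_japanese_phrases(phrases: List[Phrase]) -> List[Phrase]:
    # Toy stand-in for icu_tokenizer_japanese.split_key_japanese_phrases().
    return [Phrase(p.ptype, p.text.strip()) for p in phrases if p.text]


preprocess_query_functions: List[Callable[[List[Phrase]], List[Phrase]]] = [
    normalize_phrases,
    split_key_japanese_phrases,
]

phrases = [Phrase('NONE', '  Osaka  ')]
for func in preprocess_query_functions:
    phrases = func(phrases)
print(phrases)   # [Phrase(ptype='NONE', text='osaka')]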
+ """ + for key in soft_phrase_idx_set: + if i < key and key < tlist.end: + tlist.add_penalty(0.5) def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None: """ Add penalties to tokens that depend on presence of other token. """ + soft_phrase_idx_set = self.collect_soft_phrase_indexes(query) + for i, node, tlist in query.iter_token_lists(): + self.add_soft_phrase_penalties(i, tlist, soft_phrase_idx_set) if tlist.ttype == qmod.TokenType.POSTCODE: for repl in node.starting: if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \ diff --git a/nominatim/api/search/icu_tokenizer_japanese.py b/nominatim/api/search/icu_tokenizer_japanese.py new file mode 100644 index 000000000..ff490a321 --- /dev/null +++ b/nominatim/api/search/icu_tokenizer_japanese.py @@ -0,0 +1,67 @@ +# from nominatim.tokenizer.sanitizers.tag_japanese import convert_kanji_sequence_to_number + +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +This file divides Japanese addresses into three categories: +prefecture, municipality, and other. +The division is not strict but simple using these keywords. +Based on this division, icu_tokenizer.py inserts +a SOFT_PHRASE break between the divided words +and penalizes the words with this SOFT_PHRASE +to lower the search priority. +""" +import re +from typing import List +from nominatim.api.search import query as qmod + +def transliterate(text: str) -> str: + """ + This function performs a division on the given text using a regular expression. + """ + pattern_full = r''' + (...??[都道府県]) # [group1] prefecture + (.+?[市区町村]) # [group2] municipalities (city/wards/towns/villages) + (.+) # [group3] other words + ''' + pattern_1 = r''' + (...??[都道府県]) # [group1] prefecture + (.+) # [group3] other words + ''' + pattern_2 = r''' + (.+?[市区町村]) # [group2] municipalities (city/wards/towns/villages) + (.+) # [group3] other words + ''' + result_full = re.match(pattern_full, text, re.VERBOSE) + result_1 = re.match(pattern_1, text, re.VERBOSE) + result_2 = re.match(pattern_2, text, re.VERBOSE) + if result_full is not None: + joined_group = ''.join([ + result_full.group(1), + ', ', + result_full.group(2), + ', ', + result_full.group(3) + ]) + return joined_group + if result_1 is not None: + joined_group = ''.join([result_1.group(1),', ',result_1.group(2)]) + return joined_group + if result_2 is not None: + joined_group = ''.join([result_2.group(1),', ',result_2.group(2)]) + return joined_group + return text + +def split_key_japanese_phrases( + phrases: List[qmod.Phrase] +) -> List[qmod.Phrase]: + """Split a Japanese address using japanese_tokenizer. + """ + splited_address = list(filter(lambda p: p.text, + (qmod.Phrase(p.ptype, transliterate(p.text)) + for p in phrases))) + return splited_address diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py index 5d75eb0fb..81c60f18b 100644 --- a/nominatim/api/search/query.py +++ b/nominatim/api/search/query.py @@ -29,6 +29,7 @@ class BreakType(enum.Enum): """ Break created as a result of tokenization. This may happen in languages without spaces between words. 
""" + SOFT_PHRASE = ':' class TokenType(enum.Enum): diff --git a/nominatim/api/search/token_assignment.py b/nominatim/api/search/token_assignment.py index 3f0e737b0..3d69a6ddb 100644 --- a/nominatim/api/search/token_assignment.py +++ b/nominatim/api/search/token_assignment.py @@ -30,7 +30,8 @@ class TypedRange: qmod.BreakType.PHRASE: 0.0, qmod.BreakType.WORD: 0.1, qmod.BreakType.PART: 0.2, - qmod.BreakType.TOKEN: 0.4 + qmod.BreakType.TOKEN: 0.4, + qmod.BreakType.SOFT_PHRASE: 0.0 } TypedRangeSeq = List[TypedRange] diff --git a/test/python/api/search/test_icu_japanese_query_analyzer.py b/test/python/api/search/test_icu_japanese_query_analyzer.py new file mode 100644 index 000000000..29e19cf55 --- /dev/null +++ b/test/python/api/search/test_icu_japanese_query_analyzer.py @@ -0,0 +1,86 @@ +# SPDX-License-Identifier: GPL-3.0-or-later +# +# This file is part of Nominatim. (https://nominatim.org) +# +# Copyright (C) 2023 by the Nominatim developer community. +# For a full list of authors see the git log. +""" +Tests for query analyzer for ICU tokenizer. +""" +from pathlib import Path + +import pytest +import pytest_asyncio + +from nominatim.api import NominatimAPIAsync +from nominatim.api.search.query import Phrase, PhraseType, BreakType +import nominatim.api.search.icu_tokenizer as tok + +async def add_word(conn, word_id, word_token, wtype, word, info = None): + t = conn.t.meta.tables['word'] + await conn.execute(t.insert(), {'word_id': word_id, + 'word_token': word_token, + 'type': wtype, + 'word': word, + 'info': info}) + + +def make_phrase(query): + return [Phrase(PhraseType.NONE, s) for s in query.split(',')] +@pytest_asyncio.fixture +async def conn(table_factory): + """ Create an asynchronous SQLAlchemy engine for the test DB. + """ + table_factory('nominatim_properties', + definition='property TEXT, value TEXT', + content=(('tokenizer_import_normalisation', ':: lower();'), + ('tokenizer_import_transliteration', "'1' > '/1/'; 'ä' > 'ä '"))) + table_factory('word', + definition='word_id INT, word_token TEXT, type TEXT, word TEXT, info JSONB') + + api = NominatimAPIAsync(Path('/invalid'), {}) + async with api.begin() as conn: + yield conn + await api.close() +@pytest.mark.asyncio +async def test_soft_phrase(conn): + ana = await tok.create_query_analyzer(conn) + + await add_word(conn, 100, 'da', 'w', None) + await add_word(conn, 101, 'ban', 'w', None) + await add_word(conn, 102, 'fu', 'w', None) + await add_word(conn, 103, 'shi', 'w', None) + + await add_word(conn, 1, 'da ban fu', 'W', '大阪府') + await add_word(conn, 2, 'da ban shi', 'W', '大阪市') + await add_word(conn, 3, 'da ban', 'W', '大阪') + query = await ana.analyze_query(make_phrase('大阪府大阪市大阪')) + assert query.nodes[0].btype == BreakType.START + assert query.nodes[1].btype == BreakType.SOFT_PHRASE + assert query.nodes[2].btype == BreakType.SOFT_PHRASE + assert query.nodes[3].btype == BreakType.END + + query2 = await ana.analyze_query(make_phrase('大阪府大阪')) + assert query2.nodes[1].btype == BreakType.SOFT_PHRASE + + query3 = await ana.analyze_query(make_phrase('大阪市大阪')) + assert query3.nodes[1].btype == BreakType.SOFT_PHRASE + +@pytest.mark.asyncio +async def test_penalty_soft_phrase(conn): + ana = await tok.create_query_analyzer(conn) + + await add_word(conn, 104, 'da', 'w', 'da') + await add_word(conn, 105, 'ban', 'w', 'ban') + await add_word(conn, 107, 'shi', 'w', 'shi') + + await add_word(conn, 2, 'da ban shi', 'W', '大阪市') + await add_word(conn, 3, 'da ban', 'W', '大阪') + await add_word(conn, 4, 'da ban shi da ban', 'W', '大阪市大阪') + + query 
diff --git a/test/python/api/search/test_icu_japanese_query_analyzer.py b/test/python/api/search/test_icu_japanese_query_analyzer.py
new file mode 100644
index 000000000..29e19cf55
--- /dev/null
+++ b/test/python/api/search/test_icu_japanese_query_analyzer.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for Japanese address handling in the ICU tokenizer query analyzer.
+"""
+from pathlib import Path
+
+import pytest
+import pytest_asyncio
+
+from nominatim.api import NominatimAPIAsync
+from nominatim.api.search.query import Phrase, PhraseType, BreakType
+import nominatim.api.search.icu_tokenizer as tok
+
+async def add_word(conn, word_id, word_token, wtype, word, info=None):
+    t = conn.t.meta.tables['word']
+    await conn.execute(t.insert(), {'word_id': word_id,
+                                    'word_token': word_token,
+                                    'type': wtype,
+                                    'word': word,
+                                    'info': info})
+
+
+def make_phrase(query):
+    return [Phrase(PhraseType.NONE, s) for s in query.split(',')]
+@pytest_asyncio.fixture
+async def conn(table_factory):
+    """ Create an asynchronous SQLAlchemy engine for the test DB.
+    """
+    table_factory('nominatim_properties',
+                  definition='property TEXT, value TEXT',
+                  content=(('tokenizer_import_normalisation', ':: lower();'),
+                           ('tokenizer_import_transliteration', "'1' > '/1/'; 'ä' > 'ä '")))
+    table_factory('word',
+                  definition='word_id INT, word_token TEXT, type TEXT, word TEXT, info JSONB')
+
+    api = NominatimAPIAsync(Path('/invalid'), {})
+    async with api.begin() as conn:
+        yield conn
+    await api.close()
+@pytest.mark.asyncio
+async def test_soft_phrase(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_word(conn, 100, 'da', 'w', None)
+    await add_word(conn, 101, 'ban', 'w', None)
+    await add_word(conn, 102, 'fu', 'w', None)
+    await add_word(conn, 103, 'shi', 'w', None)
+
+    await add_word(conn, 1, 'da ban fu', 'W', '大阪府')
+    await add_word(conn, 2, 'da ban shi', 'W', '大阪市')
+    await add_word(conn, 3, 'da ban', 'W', '大阪')
+    query = await ana.analyze_query(make_phrase('大阪府大阪市大阪'))
+    assert query.nodes[0].btype == BreakType.START
+    assert query.nodes[1].btype == BreakType.SOFT_PHRASE
+    assert query.nodes[2].btype == BreakType.SOFT_PHRASE
+    assert query.nodes[3].btype == BreakType.END
+
+    query2 = await ana.analyze_query(make_phrase('大阪府大阪'))
+    assert query2.nodes[1].btype == BreakType.SOFT_PHRASE
+
+    query3 = await ana.analyze_query(make_phrase('大阪市大阪'))
+    assert query3.nodes[1].btype == BreakType.SOFT_PHRASE
+
+@pytest.mark.asyncio
+async def test_penalty_soft_phrase(conn):
+    ana = await tok.create_query_analyzer(conn)
+
+    await add_word(conn, 104, 'da', 'w', 'da')
+    await add_word(conn, 105, 'ban', 'w', 'ban')
+    await add_word(conn, 107, 'shi', 'w', 'shi')
+
+    await add_word(conn, 2, 'da ban shi', 'W', '大阪市')
+    await add_word(conn, 3, 'da ban', 'W', '大阪')
+    await add_word(conn, 4, 'da ban shi da ban', 'W', '大阪市大阪')
+
+    query = await ana.analyze_query(make_phrase('da ban shi da ban'))
+
+    torder = [(tl.tokens[0].penalty, tl.tokens[0].lookup_word) for tl in query.nodes[0].starting]
+    torder.sort()
+
+    assert torder[-1][-1] == '大阪市大阪'
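test_penalty_soft_phrase checks the effect indirectly: it sorts the (penalty, lookup_word) pairs of all tokens starting at the first query node and expects the full match '大阪市大阪' to come last, i.e. to carry the highest penalty. A toy illustration of that ordering with made-up penalty values:

# Toy version of the ordering check; the penalty numbers are invented,
# only the relative order matters.
torder = [
    (0.0, '大阪市'),      # token ending at the SOFT_PHRASE break: no penalty
    (0.5, '大阪市大阪'),  # token spanning the break: penalized
]
torder.sort()
assert torder[-1][-1] == '大阪市大阪'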