made a module to split Japanese words

osm-search · Aug 18, 2023 · 8f43956 · 8f43956
1 parent 4559886
commit 8f43956
Show file tree

Hide file tree

Showing 5 changed files with 212 additions and 8 deletions.
diff --git a/nominatim/api/search/icu_tokenizer.py b/nominatim/api/search/icu_tokenizer.py
@@ -7,7 +7,7 @@
 """
 Implementation of query analysis for the ICU tokenizer.
 """
-from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast
+from typing import Tuple, Dict, List, Optional, NamedTuple, Iterator, Any, cast, Set
 from copy import copy
 from collections import defaultdict
 import dataclasses
@@ -22,7 +22,7 @@
 from nominatim.api.logging import log
 from nominatim.api.search import query as qmod
 from nominatim.api.search.query_analyzer_factory import AbstractQueryAnalyzer
-
+from nominatim.api.search import icu_tokenizer_japanese
 
 DB_TO_TOKEN_TYPE = {
     'W': qmod.TokenType.WORD,
@@ -161,10 +161,14 @@ async def analyze_query(self, phrases: List[qmod.Phrase]) -> qmod.QueryStruct:
             tokenized query.
         """
         log().section('Analyze query (using ICU tokenizer)')
-        normalized = list(filter(lambda p: p.text,
-                                 (qmod.Phrase(p.ptype, self.normalize_text(p.text))
-                                  for p in phrases)))
-        query = qmod.QueryStruct(normalized)
+        preprocess_query_functions = [
+            self.normalize_phrases,
+            icu_tokenizer_japanese.split_key_japanese_phrases
+        ]
+        for func in preprocess_query_functions:
+            phrases = func(phrases)
+
+        query = qmod.QueryStruct(phrases)
         log().var_dump('Normalized query', query.source)
         if not query.source:
             return query
@@ -203,6 +207,25 @@ def normalize_text(self, text: str) -> str:
         """
         return cast(str, self.normalizer.transliterate(text))
 
+    def normalize_phrases(
+        self, phrases: List[qmod.Phrase]
+    ) -> List[qmod.Phrase]:
+        """ Return new phrases whose text has been run through
+            normalize_text(). The phrase type is carried over unchanged.
+            Phrases whose normalized text is empty are left out.
+        """
+        candidates = (qmod.Phrase(phrase.ptype, self.normalize_text(phrase.text))
+                      for phrase in phrases)
+        return [phrase for phrase in candidates if phrase.text]
+
+    def split_key_japanese_phrases(
+        self, phrases: List[qmod.Phrase]
+    ) -> List[qmod.Phrase]:
+        """ Split Japanese addresses in the given phrases into their parts.
+
+            Delegates to icu_tokenizer_japanese.split_key_japanese_phrases()
+            so that the splitting logic lives in a single place. Returns a
+            new list of phrases; phrases with empty text are left out.
+        """
+        return icu_tokenizer_japanese.split_key_japanese_phrases(phrases)
 
     def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
         """ Transliterate the phrases and split them into tokens.
@@ -224,7 +247,10 @@ def split_query(self, query: qmod.QueryStruct) -> Tuple[QueryParts, WordDict]:
                         if term:
                             parts.append(QueryPart(term, word, wordnr))
                             query.add_node(qmod.BreakType.TOKEN, phrase.ptype)
-                    query.nodes[-1].btype = qmod.BreakType.WORD
+                    if word[-1] == ',':
+                        query.nodes[-1].btype = qmod.BreakType.SOFT_PHRASE
+                    else:
+                        query.nodes[-1].btype = qmod.BreakType.WORD
                 wordnr += 1
             query.nodes[-1].btype = qmod.BreakType.PHRASE
 
@@ -254,11 +280,34 @@ def add_extra_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
                 query.add_token(qmod.TokenRange(i, i+1), qmod.TokenType.HOUSENUMBER,
                                 ICUToken(0.5, 0, 1, part.token, True, part.token, None))
 
+    def collect_soft_phrase_indexes(self, query: qmod.QueryStruct) -> Set[int]:
+        """ Return the positions within query.nodes of all nodes whose
+            break type is SOFT_PHRASE.
+        """
+        return {pos for pos, node in enumerate(query.nodes)
+                if node.btype == qmod.BreakType.SOFT_PHRASE}
+
+    def add_soft_phrase_penalties(
+        self,
+        i: int,
+        tlist: qmod.TokenList,
+        soft_phrase_idx_set: Set[int]
+    ) -> None:
+        """ Penalize a token list for every soft phrase break it crosses.
+
+            For each index in `soft_phrase_idx_set` that lies strictly
+            between `i` and `tlist.end`, tlist.add_penalty(0.5) is called
+            once. Nothing happens when no index falls into that range.
+        """
+        for break_idx in soft_phrase_idx_set:
+            if not i < break_idx < tlist.end:
+                continue
+            tlist.add_penalty(0.5)
 
     def rerank_tokens(self, query: qmod.QueryStruct, parts: QueryParts) -> None:
         """ Add penalties to tokens that depend on presence of other token.
         """
+        soft_phrase_idx_set = self.collect_soft_phrase_indexes(query)
+
         for i, node, tlist in query.iter_token_lists():
+            self.add_soft_phrase_penalties(i, tlist, soft_phrase_idx_set)
             if tlist.ttype == qmod.TokenType.POSTCODE:
                 for repl in node.starting:
                     if repl.end == tlist.end and repl.ttype != qmod.TokenType.POSTCODE \

diff --git a/nominatim/api/search/icu_tokenizer_japanese.py b/nominatim/api/search/icu_tokenizer_japanese.py
@@ -0,0 +1,67 @@
+# from nominatim.tokenizer.sanitizers.tag_japanese import convert_kanji_sequence_to_number
+
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+This file divides Japanese addresses into three categories:
+prefecture, municipality, and other.
+The division is not strict: it is a simple split after the suffix
+characters for prefectures (都道府県) and municipalities (市区町村).
+Based on this division, icu_tokenizer.py inserts
+a SOFT_PHRASE break between the divided words
+and penalizes tokens that span a SOFT_PHRASE break
+to lower their search priority.
+"""
+import re
+from typing import List
+from nominatim.api.search import query as qmod
+
+# Candidate splits of an address, most specific first.
+# NOTE: the inline labels name the address category, not the regex group
+# number. In the two-group patterns the trailing '(.+)' is regex group 2.
+_PATTERN_FULL = r'''
+               (...??[都道府県])            # [group1] prefecture
+               (.+?[市区町村])              # [group2] municipalities (city/wards/towns/villages)
+               (.+)                         # [group3] other words
+               '''
+_PATTERN_PREFECTURE = r'''
+               (...??[都道府県])            # [group1] prefecture
+               (.+)                         # [group3] other words
+               '''
+_PATTERN_MUNICIPALITY = r'''
+               (.+?[市区町村])              # [group2] municipalities (city/wards/towns/villages)
+               (.+)                         # [group3] other words
+               '''
+# Compiled once at import time; transliterate() tries them in this order.
+_SPLIT_PATTERNS = tuple(
+    re.compile(pattern, re.VERBOSE)
+    for pattern in (_PATTERN_FULL, _PATTERN_PREFECTURE, _PATTERN_MUNICIPALITY)
+)
+
+
+def transliterate(text: str) -> str:
+    """
+    Insert ', ' between the recognised parts of a Japanese address.
+
+    Despite its name, this function does not transliterate anything. It
+    matches the start of `text` against three patterns in turn:
+    prefecture + municipality + rest, prefecture + rest and
+    municipality + rest. The parts captured by the first pattern that
+    matches are joined with ', ' and returned.
+
+    When no pattern matches (for example text without any of the suffix
+    characters, or the empty string), `text` is returned unchanged.
+    """
+    for pattern in _SPLIT_PATTERNS:
+        # Later patterns are only evaluated when the earlier ones fail.
+        match = pattern.match(text)
+        if match is not None:
+            return ', '.join(match.groups())
+    return text
+
+def split_key_japanese_phrases(
+    phrases: List[qmod.Phrase]
+) -> List[qmod.Phrase]:
+    """Apply transliterate() to the text of every phrase.
+
+    Returns new phrases that keep the original phrase type. Phrases whose
+    resulting text is empty are left out.
+    """
+    rewritten = (qmod.Phrase(phrase.ptype, transliterate(phrase.text))
+                 for phrase in phrases)
+    return [phrase for phrase in rewritten if phrase.text]
diff --git a/nominatim/api/search/query.py b/nominatim/api/search/query.py
@@ -29,6 +29,7 @@ class BreakType(enum.Enum):
     """ Break created as a result of tokenization.
         This may happen in languages without spaces between words.
     """
+    SOFT_PHRASE = ':'
 
 
 class TokenType(enum.Enum):

diff --git a/nominatim/api/search/token_assignment.py b/nominatim/api/search/token_assignment.py
@@ -30,7 +30,8 @@ class TypedRange:
     qmod.BreakType.PHRASE: 0.0,
     qmod.BreakType.WORD: 0.1,
     qmod.BreakType.PART: 0.2,
-    qmod.BreakType.TOKEN: 0.4
+    qmod.BreakType.TOKEN: 0.4,
+    qmod.BreakType.SOFT_PHRASE: 0.0
 }
 
 TypedRangeSeq = List[TypedRange]

diff --git a/test/python/api/search/test_icu_japanese_query_analyzer.py b/test/python/api/search/test_icu_japanese_query_analyzer.py
@@ -0,0 +1,86 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+#
+# This file is part of Nominatim. (https://nominatim.org)
+#
+# Copyright (C) 2023 by the Nominatim developer community.
+# For a full list of authors see the git log.
+"""
+Tests for the Japanese address splitting of the ICU query analyzer.
+"""
+from pathlib import Path
+
+import pytest
+import pytest_asyncio
+
+from nominatim.api import NominatimAPIAsync
+from nominatim.api.search.query import Phrase, PhraseType, BreakType
+import nominatim.api.search.icu_tokenizer as tok
+
+async def add_word(conn, word_id, word_token, wtype, word, info = None):
+    """ Insert a single row into the 'word' table through `conn`.
+    """
+    row = {'word_id': word_id,
+           'word_token': word_token,
+           'type': wtype,
+           'word': word,
+           'info': info}
+    word_table = conn.t.meta.tables['word']
+    await conn.execute(word_table.insert(), row)
+
+
+def make_phrase(query):
+    """ Build one phrase of type NONE for each comma-separated part
+        of the query string.
+    """
+    parts = query.split(',')
+    return [Phrase(PhraseType.NONE, part) for part in parts]
+@pytest_asyncio.fixture
+async def conn(table_factory):
+    """ Create an asynchronous SQLAlchemy engine for the test DB.
+
+        Creates the 'nominatim_properties' table with normalisation and
+        transliteration rules and a 'word' table without content, then
+        yields a connection opened through NominatimAPIAsync.begin().
+        The API object is closed after the yield.
+    """
+    # (property, value) rows; presumably read when the query analyzer
+    # is created - verify against create_query_analyzer().
+    table_factory('nominatim_properties',
+                  definition='property TEXT, value TEXT',
+                  content=(('tokenizer_import_normalisation', ':: lower();'),
+                           ('tokenizer_import_transliteration', "'1' > '/1/'; 'ä' > 'ä '")))
+    # Created without content; the tests add their rows via add_word().
+    table_factory('word',
+                  definition='word_id INT, word_token TEXT, type TEXT, word TEXT, info JSONB')
+
+    api = NominatimAPIAsync(Path('/invalid'), {})
+    async with api.begin() as conn:
+        yield conn
+    # Teardown part of the fixture: executed after the yield returns.
+    await api.close()
+@pytest.mark.asyncio
+async def test_soft_phrase(conn):
+    """ Japanese addresses written without separators must produce
+        SOFT_PHRASE breaks between the split parts.
+    """
+    ana = await tok.create_query_analyzer(conn)
+
+    for word_id, token in ((100, 'da'), (101, 'ban'), (102, 'fu'), (103, 'shi')):
+        await add_word(conn, word_id, token, 'w', None)
+
+    for word_id, token, word in ((1, 'da ban fu', '大阪府'),
+                                 (2, 'da ban shi', '大阪市'),
+                                 (3, 'da ban', '大阪')):
+        await add_word(conn, word_id, token, 'W', word)
+
+    # Prefecture + municipality + rest: two soft breaks between the ends.
+    query = await ana.analyze_query(make_phrase('大阪府大阪市大阪'))
+    expected_breaks = (BreakType.START,
+                       BreakType.SOFT_PHRASE,
+                       BreakType.SOFT_PHRASE,
+                       BreakType.END)
+    for pos, btype in enumerate(expected_breaks):
+        assert query.nodes[pos].btype == btype
+
+    # Prefecture + rest, then municipality + rest: one soft break each.
+    for text in ('大阪府大阪', '大阪市大阪'):
+        partial = await ana.analyze_query(make_phrase(text))
+        assert partial.nodes[1].btype == BreakType.SOFT_PHRASE
+
+@pytest.mark.asyncio
+async def test_penalty_soft_phrase(conn):
+    """ Among the token lists starting at the first node, the one whose
+        first token has the largest (penalty, lookup_word) pair must be
+        the word covering the whole query.
+    """
+    ana = await tok.create_query_analyzer(conn)
+
+    for word_id, token in ((104, 'da'), (105, 'ban'), (107, 'shi')):
+        await add_word(conn, word_id, token, 'w', token)
+
+    for word_id, token, word in ((2, 'da ban shi', '大阪市'),
+                                 (3, 'da ban', '大阪'),
+                                 (4, 'da ban shi da ban', '大阪市大阪')):
+        await add_word(conn, word_id, token, 'W', word)
+
+    # NOTE(review): the query is Latin text, while the splitting patterns in
+    # icu_tokenizer_japanese only react to 都道府県 / 市区町村. It looks like
+    # no SOFT_PHRASE break is created here - confirm that this test really
+    # exercises the soft phrase penalty.
+    query = await ana.analyze_query(make_phrase('da ban shi da ban'))
+
+    ranked = sorted((tlist.tokens[0].penalty, tlist.tokens[0].lookup_word)
+                    for tlist in query.nodes[0].starting)
+
+    assert ranked[-1][-1] == '大阪市大阪'