Issue 184 - Add installation and configuration, and include LTR improvements #185

Open · wants to merge 9 commits into base: master
1 change: 1 addition & 0 deletions api/requirements.txt
@@ -95,3 +95,4 @@ wasabi==0.9.0
Werkzeug==2.0.3
wrapt==1.14.0
zipp==3.7.0
xgboost==1.6.2
1 change: 1 addition & 0 deletions indexer/requirements.txt
@@ -28,3 +28,4 @@ transformers==4.22.2
typing_extensions==4.3.0
urllib3==1.26.12
zipp==3.8.1
xgboost==1.6.2
38 changes: 28 additions & 10 deletions install.sh
@@ -6,29 +6,23 @@ export LD_RUN_PATH=/usr/local/lib
########################################

# Installing Requirements
pip install --use-deprecated=legacy-resolver -r requirements.txt
pip install --use-deprecated=legacy-resolver -r ./indexer/requirements.txt
# install the CUDA 10.1 build of torch only if it is not already present
if [[ -z $(pip freeze | grep "torch==1.6.0+cu101") ]]; then
pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
fi

########################################

# Install ElasticSearch
# Install ElasticSearch and LTR Plugin
echo "Check if ElasticSearch is installed..."
if ! [[ -d "elasticsearch-7.10.2" ]]; then
rm -f elasticsearch-7.10.2-linux-x86_64.tar.gz*
echo "Installing ElasticSearch..."
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.2-linux-x86_64.tar.gz
tar -xvzf elasticsearch-7.10.2-linux-x86_64.tar.gz
rm elasticsearch-7.10.2-linux-x86_64.tar.gz
fi

########################################

# Setting up Elasticsearch
echo "Checking if ElasticSearch is ON..."
if [[ -z $(fuser 9200/tcp) ]]; then
echo "Initializing ElasticSearch..."
echo "Initializing ElasticSearch to install Plugin"
cd ./elasticsearch-7.10.2

# remove security settings
@@ -39,19 +33,43 @@ if [[ -z $(fuser 9200/tcp) ]]; then
ES_JAVA_OPTS=-Xmx2g ./bin/elasticsearch -d
echo "Waiting ElasticSearch..."
sleep 60s
./bin/elasticsearch-plugin install https://github.com/o19s/elasticsearch-learning-to-rank/releases/download/v1.5.4-es7.10.2/ltr-plugin-v1.5.4-es7.10.2.zip
fuser -k 9200/tcp

cd ..
fi

########################################

# Indexing Documents
# Setting up Elasticsearch
echo "Checking if ElasticSearch is ON..."
if [[ -z $(fuser 9200/tcp) ]]; then
echo "Initializing ElasticSearch..."
cd ./elasticsearch-7.10.2
ES_JAVA_OPTS=-Xmx2g ./bin/elasticsearch -d
echo "Waiting ElasticSearch..."
sleep 60s
cd ..
fi
# after the plugin is installed, ElasticSearch must be restarted for it to take effect
########################################

# Indexing Documents
echo ""
echo "Indexing documents..."
cd ./indexer
if [[ -z $(curl -X GET "localhost:9200/_cat/indices/*") ]]; then
./indexing_script.sh
fi
########################################

# Setting Up LTR
export PYTHONPATH=./
echo ""
echo "Setting Up LTR..."
cd ..
python ./misc/ltr/set_ltr.py
# load the trained model
########################################

docker-compose build
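
As a quick sanity check after install.sh finishes, something like the following can confirm that ElasticSearch answered and that the LTR feature store responds. This is a minimal sketch, assuming a local unsecured ES 7.10.2 on localhost:9200; it is not part of the PR:

import requests

# ES should answer once startup completes
resp = requests.get("http://localhost:9200")
print(resp.status_code)  # expect 200

# PUT /_ltr initializes the LTR plugin's default feature store;
# it returns an error status if the store already exists, which is harmless here
init = requests.put("http://localhost:9200/_ltr")
print(init.status_code, init.text)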
4 changes: 3 additions & 1 deletion misc/feedback/candidates_generation.py
@@ -4,6 +4,8 @@
from tqdm import tqdm
import requests

CLIENT = "gsi"

def format_text(text):
import re
text = re.sub('\n+', '\n', text)
@@ -30,7 +32,7 @@ def apply_similarity(method):
try:
query = q["query"].replace("/", "")
params = {'consulta': query, 'page': 1, 'sid': '123'}
service_response = requests.get('http://localhost:8000/services/search', params)
service_response = requests.get(f'http://localhost:8000/services/{CLIENT}/search', params)
results = json.loads(service_response.text)
feedback_line = {}
feedback_line["text"] = query
6 changes: 3 additions & 3 deletions misc/feedback/candidates_processing.py
@@ -41,13 +41,13 @@
for person, data_list in feedback.items():
for entry in data_list:
new_corresponding = ""
query_tokens = entry["query"].split()
query_tokens = entry["query"].lower().split()
query_tokens = [token for token in query_tokens if not token in stopwords]
color_codes = dict(zip(query_tokens, style_list[:len(query_tokens)]))
for corres in entry["corresponding"]:
for token in corres["text"].split():
if token in query_tokens:
new_corresponding += f"<mark {color_codes[token]}>" + token + "</mark> "
if token.lower() in query_tokens:
new_corresponding += f"<mark {color_codes[token.lower()]}>" + token + "</mark> "
else:
new_corresponding += token + " "
corres["formatted_text"] = new_corresponding
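
To see the highlighting change above in isolation, here is a self-contained sketch of the case-insensitive matching; the stopwords, styles, and strings are invented for illustration:

stopwords = {"de", "o", "sobre"}
style_list = ['style="background-color:#ffff99"', 'style="background-color:#99ffcc"']

query = "Processo de Documento"
query_tokens = [t for t in query.lower().split() if t not in stopwords]
color_codes = dict(zip(query_tokens, style_list[:len(query_tokens)]))

new_corresponding = ""
for token in "Um Processo sobre o documento".split():
    if token.lower() in query_tokens:
        # lowercase only for the lookup; keep the original casing in the output
        new_corresponding += f"<mark {color_codes[token.lower()]}>" + token + "</mark> "
    else:
        new_corresponding += token + " "

print(new_corresponding)
# Um <mark style="background-color:#ffff99">Processo</mark> sobre o <mark style="background-color:#99ffcc">documento</mark>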
14 changes: 8 additions & 6 deletions misc/ltr/collect_features.py
@@ -48,13 +48,14 @@ def featureDictToList(ranklibLabeledFeatures):

def log_features(es, judgmentsByQid):
for qid, judgments in judgmentsByQid.items():
query = judgments[0].query
query = judgments[0].keywords
doc_ids = [judgment.docId for judgment in judgments]
logQuery['query']['bool']['must'][0]['terms']['_id'] = doc_ids
logQuery['query']['bool']['should'][0]['sltr']['params']['consulta'] = query
print("POST")
print(json.dumps(logQuery, indent=2))
res = es.search(index='tmdb', body=logQuery)
# TODO: review this part
res = es.search(index='processos', body=logQuery)
# Add feature back to each judgment
featuresPerDoc = {}
for doc in res['hits']['hits']:
@@ -68,7 +69,7 @@ def log_features(es, judgmentsByQid):
features = featuresPerDoc[judgment.docId] # If KeyError, then we have a judgment but no document in index
judgment.features = features
except KeyError:
print("Missing movie %s" % judgment.docId)
print("Missing document %s" % judgment.docId)


def build_train_file(judgmentsWithFeatures, filename):
@@ -79,9 +80,10 @@


if __name__ == "__main__":
from judgments import judgmentsFromFile, judgmentsByQid
from ltr.judgments import judgmentsFromFile, judgmentsByQid
from elasticsearch import Elasticsearch
es = Elasticsearch()
judgmentsByQid = judgmentsByQid(judgmentsFromFile('sample_judgments.txt'))
judgmentsByQid = judgmentsByQid(judgmentsFromFile('./ltr/human_judgments.txt'))
print(judgmentsByQid)
log_features(es, judgmentsByQid)
build_train_file(judgmentsByQid, "sample_judgments_wfeatures.txt")
build_train_file(judgmentsByQid, "./ltr/human_judgments_wfeatures.txt")
32 changes: 32 additions & 0 deletions misc/ltr/human_judgments.txt
@@ -0,0 +1,32 @@
# qid:1: não gerou documento*1
# qid:2: documento*1
# qid:3: processo*1

0 qid:3 # 90d770145b6eef3274e81ae18727498a5d02a73a processo
2 qid:3 # 0233b59d4625333383eed758b767bde71f34114a processo
4 qid:3 # d8f55c963d088b52f86e4e65311ef33152ff0781 processo
2 qid:3 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f processo
0 qid:3 # 90d770145b6eef3274e81ae18727498a5d02a73a processo
2 qid:2 # 0233b59d4625333383eed758b767bde71f34114a documento
4 qid:2 # d8f55c963d088b52f86e4e65311ef33152ff0781 documento
2 qid:2 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f documento
0 qid:2 # 90d770145b6eef3274e81ae18727498a5d02a73a documento
2 qid:2 # 0233b59d4625333383eed758b767bde71f34114a documento
4 qid:2 # d8f55c963d088b52f86e4e65311ef33152ff0781 documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
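
For reference, this file appears to follow the RankLib-style judgment format parsed by judgments.py below: each body line is "<grade> qid:<n> # <docId> <query keywords>", where the single-digit grade runs from 0 (irrelevant) up to 4 (highly relevant), and the "*1" suffix in the header comments is a per-query weight that duplicateJudgmentsByWeight() uses to replicate a query's judgment group.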
Empty file removed misc/ltr/judgements.txt
Empty file.
122 changes: 122 additions & 0 deletions misc/ltr/judgments.py
@@ -0,0 +1,122 @@
import re

class Judgment:
    def __init__(self, grade, qid, keywords, docId, weight=1):
        self.grade = grade
        self.qid = qid
        self.keywords = keywords
        self.docId = docId
        self.features = []  # 0th feature is ranklib feature 1
        self.weight = weight

    def sameQueryAndDoc(self, other):
        return self.qid == other.qid and self.docId == other.docId

    def __str__(self):
        return "grade:%s qid:%s (%s) docid:%s" % (self.grade, self.qid, self.keywords, self.docId)

    def toRanklibFormat(self):
        featuresAsStrs = ["%s:%s" % (idx+1, feature) for idx, feature in enumerate(self.features)]
        comment = "# %s\t%s" % (self.docId, self.keywords)
        return "%s\tqid:%s\t%s %s" % (self.grade, self.qid, "\t".join(featuresAsStrs), comment)


def _queriesToHeader(qidToKwDict):
    rVal = ""
    for qid, kws in qidToKwDict.items():
        rVal += "# qid:%s: %s" % (qid, kws[0])
        rVal += "*%s\n" % kws[1]
    rVal += "\n"
    return rVal


def _queriesFromHeader(lines):
    """ Parses out the mapping between query id and user keywords
        from header comments, i.e.:
        # qid:523: First Blood
        returns a dict mapping all query ids to search keywords"""
    # Regex can be debugged here:
    # http://www.regexpal.com/?fam=96564
    regex = re.compile(r'#\sqid:(\d+?):\s+?(.*)')
    rVal = {}
    for line in lines:
        if line[0] != '#':
            break
        m = re.match(regex, line)
        if m:
            keywordAndWeight = m.group(2).split('*')
            keyword = keywordAndWeight[0]
            weight = 1
            if len(keywordAndWeight) > 1:
                weight = int(keywordAndWeight[1])
            rVal[int(m.group(1))] = (keyword, weight)

    return rVal

def _judgmentsFromBody(lines):
    """ Parses out judgment/grade, query id, and docId in a line such as:
        4 qid:523 # a01 Grade for Rambo for query Foo
        <judgment> qid:<queryid> # docId <rest of comment ignored...>"""
    # Regex can be debugged here:
    # http://www.regexpal.com/?fam=96565
    regex = re.compile(r'^(\d)\s+qid:(\d+)\s+#\s+([^\s]*).*')
    for line in lines:
        m = re.match(regex, line)
        if m:
            print("%s,%s,%s" % (m.group(1), m.group(2), m.group(3)))
            yield int(m.group(1)), int(m.group(2)), m.group(3)


def judgmentsFromFile(filename):
    with open(filename) as f:
        qidToKeywords = _queriesFromHeader(f)
    with open(filename) as f:
        for grade, qid, docId in _judgmentsFromBody(f):
            yield Judgment(grade=grade, qid=qid, keywords=qidToKeywords[qid][0], weight=qidToKeywords[qid][1], docId=docId)


def judgmentsToFile(filename, judgmentsList):
    judgToQid = judgmentsByQid(judgmentsList)  # Pretty hideously slow stuff
    fileHeader = _queriesToHeader({qid: (judgs[0].keywords, judgs[0].weight) for qid, judgs in judgToQid.items()})
    judgByQid = sorted(judgmentsList, key=lambda j: j.qid)
    with open(filename, 'w+') as f:
        f.write(fileHeader)
        for judg in judgByQid:
            f.write(judg.toRanklibFormat() + '\n')


def judgmentsByQid(judgments):
    rVal = {}
    for judgment in judgments:
        try:
            rVal[judgment.qid].append(judgment)
        except KeyError:
            rVal[judgment.qid] = [judgment]
    return rVal


def duplicateJudgmentsByWeight(judgmentsByQid):
    rVal = {}
    from copy import deepcopy
    # start new qids after the highest existing one
    maxQid = max(judgmentsByQid.keys(), default=0)
    print("maxQid %s" % maxQid)
    for qid, judgments in judgmentsByQid.items():
        rVal[qid] = judgments
        if judgments[0].weight > 1:
            # duplicate the query group (weight - 1) extra times under fresh
            # qids, mutating the copies rather than the original judgments
            for _ in range(judgments[0].weight - 1):
                maxQid += 1
                copied = deepcopy(judgments)
                for judg in copied:
                    judg.qid = maxQid
                rVal[maxQid] = copied
    return rVal

if __name__ == "__main__":
    from sys import argv
    for judgment in judgmentsFromFile(argv[1]):
        print(judgment)
58 changes: 29 additions & 29 deletions misc/ltr/load_features_and_model.py → misc/ltr/load_features.py
@@ -10,7 +10,7 @@ def save_model(es_host, feature_set="m05"):
"model": {
"name": "m05_model",
"model": {
"type": "model/linear"
"type": "model/xgboost+json"
}
}
}
@@ -22,39 +22,39 @@
resp = requests.post(fullPath)
if (resp.status_code >= 300):
print(resp.text)

features = [f[:-5] for f in listdir("./features") if isfile(join("./features", f))]
modelContent = {f: 1 for f in features}
modelContent = json.dumps(modelContent)
path = "_ltr/_featureset/%s/_createmodel" % feature_set
fullPath = urljoin(es_host, path)
modelPayload['model']['model']['definition'] = modelContent
print("POST %s" % fullPath)
resp = requests.post(fullPath, json.dumps(modelPayload), headers={"Content-Type": "application/json"})
print(resp.status_code)
if (resp.status_code >= 300):
print(resp.text)
with open('./ltr/xgboost_model.json') as modelFile:
modelContent = modelFile.read()
path = "_ltr/_featureset/%s/_createmodel" % feature_set
fullPath = urljoin(es_host, path)
modelPayload['model']['model']['definition'] = modelContent
print("POST %s" % fullPath)
resp = requests.post(fullPath, json.dumps(modelPayload), headers={"Content-Type": "application/json"})
print(resp.status_code)
if (resp.status_code >= 300):
print(resp.text)

def get_feature(feature_name):
with open('./features/%s.json' % feature_name) as f:
with open('./ltr/features/%s.json' % feature_name) as f:
return json.loads(f.read())

def each_feature():
features = [f[:-5] for f in listdir("./features") if isfile(join("./features", f))]
try:
for feature in features:
parsedJson = get_feature(feature)
template = parsedJson['query']
feature_spec = {
"name": "%s" % feature,
"params": ["consulta"],
"template": template
}
print("Loading feature %s" % feature)
print(feature_spec)
yield feature_spec
except IOError:
pass
features = [f[:-5] for f in listdir("./ltr/features") if isfile(join("./ltr/features", f))]
with open('./ltr/featmap.txt', "w") as f:
try:
for idx, feature in enumerate(features):
parsedJson = get_feature(feature)
template = parsedJson['query']
feature_spec = {
"name": "%s" % feature,
"params": ["consulta"],
"template": template
}
print("Loading feature %s" % feature)
# feature types: use i for indicator and q for quantity
f.write(f"{idx} {feature} q" + "\n")
yield feature_spec
except IOError:
pass

def load_features(es_host, feature_set_name='m05'):
feature_set = {
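
The model type above changes from model/linear to model/xgboost+json, and save_model() now uploads the contents of ./ltr/xgboost_model.json, but the PR does not show how that file is produced. A minimal training sketch, assuming the logged judgments are saved in LIBSVM format with qid markers (with '#' comments stripped, rows sorted and grouped by qid) and that featmap.txt was written by each_feature(); scikit-learn is used here only as a convenient LIBSVM reader:

import numpy as np
import xgboost as xgb
from sklearn.datasets import load_svmlight_file

# Read labels, features, and query ids from the RankLib/LIBSVM-style file
X, y, qid = load_svmlight_file('./ltr/human_judgments_wfeatures.txt', query_id=True)
dtrain = xgb.DMatrix(X, label=y)
# group sizes per query; np.unique returns qids in ascending order,
# which matches the file order under the sortedness assumption above
_, counts = np.unique(qid, return_counts=True)
dtrain.set_group(counts)

bst = xgb.train({'objective': 'rank:pairwise'}, dtrain, num_boost_round=50)

# Dump each tree as JSON, naming features via featmap.txt, and wrap the dumps
# in a JSON array: the format the LTR plugin accepts for "model/xgboost+json"
trees = bst.get_dump(fmap='./ltr/featmap.txt', dump_format='json')
with open('./ltr/xgboost_model.json', 'w') as f:
    f.write('[' + ','.join(trees) + ']')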