Issue 184 - Add installation and configuration, and include LTR improvements #185

Open · wants to merge 9 commits into base: master
1 change: 1 addition & 0 deletions api/requirements.txt
@@ -95,3 +95,4 @@ wasabi==0.9.0
Werkzeug==2.0.3
wrapt==1.14.0
zipp==3.7.0
xgboost==1.6.2
1 change: 1 addition & 0 deletions indexer/requirements.txt
@@ -28,3 +28,4 @@ transformers==4.22.2
typing_extensions==4.3.0
urllib3==1.26.12
zipp==3.8.1
xgboost==1.6.2
38 changes: 28 additions & 10 deletions install.sh
@@ -6,29 +6,23 @@ export LD_RUN_PATH=/usr/local/lib
########################################

# Installing Requirements
pip install --use-deprecated=legacy-resolver -r requirements.txt
pip install --use-deprecated=legacy-resolver -r ./indexer/requirements.txt
# install the CUDA 10.1 build of torch only if it is not already present
if [[ -z $(pip freeze | grep "torch==1.6.0+cu101") ]]; then
pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
fi

########################################

# Install ElasticSearch
# Install ElasticSearch and LTR Plugin
echo "Check if ElasticSearch is installed..."
if ! [[ -d "elasticsearch-7.10.2" ]]; then
rm -f elasticsearch-7.10.2-linux-x86_64.tar.gz*
echo "Installing ElasticSearch..."
wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.10.2-linux-x86_64.tar.gz
tar -xvzf elasticsearch-7.10.2-linux-x86_64.tar.gz
rm elasticsearch-7.10.2-linux-x86_64.tar.gz
fi

########################################

# Setting up Elasticsearch
echo "Checking if ElasticSearch is ON..."
if [[ -z $(fuser 9200/tcp) ]]; then
echo "Initializing ElasticSearch..."
echo "Initializing ElasticSearch to install Plugin"
cd ./elasticsearch-7.10.2

# remove security settings
@@ -39,19 +33,43 @@ if [[ -z $(fuser 9200/tcp) ]]; then
ES_JAVA_OPTS=-Xmx2g ./bin/elasticsearch -d
echo "Waiting ElasticSearch..."
sleep 60s
./bin/elasticsearch-plugin install https://github.com/o19s/elasticsearch-learning-to-rank/releases/download/v1.5.4-es7.10.2/ltr-plugin-v1.5.4-es7.10.2.zip
fuser -k 9200/tcp

cd ..
fi

########################################

# Indexing Documents
# Setting up Elasticsearch
echo "Checking if ElasticSearch is ON..."
if [[ -z $(fuser 9200/tcp) ]]; then
echo "Initializing ElasticSearch..."
cd ./elasticsearch-7.10.2
ES_JAVA_OPTS=-Xmx2g ./bin/elasticsearch -d
echo "Waiting ElasticSearch..."
sleep 60s
cd ..
fi
# after the plugin is installed, ElasticSearch must be restarted for it to take effect
########################################

# Indexing Documents
echo ""
echo "Indexing documents..."
cd ./indexer
if [[ -z $(curl -X GET "localhost:9200/_cat/indices/*") ]]; then
./indexing_script.sh
fi
########################################

# Setting Up LTR
export PYTHONPATH=./
echo ""
echo "Setting Up LTR..."
cd ..
python ./misc/ltr/set_ltr.py
# load the trained model
########################################

docker-compose build
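
As a quick sanity check after install.sh finishes, something like the following can confirm that ElasticSearch answered and that the LTR feature store responds. This is a minimal sketch, assuming a local unsecured ES 7.10.2 on localhost:9200; it is not part of the PR:

import requests

# ES should answer once startup completes
resp = requests.get("http://localhost:9200")
print(resp.status_code)  # expect 200

# PUT /_ltr initializes the LTR plugin's default feature store;
# it returns an error status if the store already exists, which is harmless here
init = requests.put("http://localhost:9200/_ltr")
print(init.status_code, init.text)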
4 changes: 3 additions & 1 deletion misc/feedback/candidates_generation.py
@@ -4,6 +4,8 @@
from tqdm import tqdm
import requests

CLIENT = "gsi"

def format_text(text):
import re
text = re.sub('\n+', '\n', text)
@@ -30,7 +32,7 @@ def apply_similarity(method):
try:
query = q["query"].replace("/", "")
params = {'consulta': query, 'page': 1, 'sid': '123'}
service_response = requests.get('http://localhost:8000/services/search', params)
service_response = requests.get(f'http://localhost:8000/services/{CLIENT}/search', params)
results = json.loads(service_response.text)
feedback_line = {}
feedback_line["text"] = query
6 changes: 3 additions & 3 deletions misc/feedback/candidates_processing.py
@@ -41,13 +41,13 @@
for person, data_list in feedback.items():
for entry in data_list:
new_corresponding = ""
query_tokens = entry["query"].split()
query_tokens = entry["query"].lower().split()
query_tokens = [token for token in query_tokens if not token in stopwords]
color_codes = dict(zip(query_tokens, style_list[:len(query_tokens)]))
for corres in entry["corresponding"]:
for token in corres["text"].split():
if token in query_tokens:
new_corresponding += f"<mark {color_codes[token]}>" + token + "</mark> "
if token.lower() in query_tokens:
new_corresponding += f"<mark {color_codes[token.lower()]}>" + token + "</mark> "
else:
new_corresponding += token + " "
corres["formatted_text"] = new_corresponding
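
To see the highlighting change above in isolation, here is a self-contained sketch of the case-insensitive matching; the stopwords, styles, and strings are invented for illustration:

stopwords = {"de", "o", "sobre"}
style_list = ['style="background-color:#ffff99"', 'style="background-color:#99ffcc"']

query = "Processo de Documento"
query_tokens = [t for t in query.lower().split() if t not in stopwords]
color_codes = dict(zip(query_tokens, style_list[:len(query_tokens)]))

new_corresponding = ""
for token in "Um Processo sobre o documento".split():
    if token.lower() in query_tokens:
        # lowercase only for the lookup; keep the original casing in the output
        new_corresponding += f"<mark {color_codes[token.lower()]}>" + token + "</mark> "
    else:
        new_corresponding += token + " "

print(new_corresponding)
# Um <mark style="background-color:#ffff99">Processo</mark> sobre o <mark style="background-color:#99ffcc">documento</mark>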
14 changes: 8 additions & 6 deletions misc/ltr/collect_features.py
@@ -48,13 +48,14 @@ def featureDictToList(ranklibLabeledFeatures):

def log_features(es, judgmentsByQid):
for qid, judgments in judgmentsByQid.items():
query = judgments[0].query
query = judgments[0].keywords
doc_ids = [judgment.docId for judgment in judgments]
logQuery['query']['bool']['must'][0]['terms']['_id'] = doc_ids
logQuery['query']['bool']['should'][0]['sltr']['params']['consulta'] = query
print("POST")
print(json.dumps(logQuery, indent=2))
res = es.search(index='tmdb', body=logQuery)
# TODO: review this part
res = es.search(index='processos', body=logQuery)
# Add feature back to each judgment
featuresPerDoc = {}
for doc in res['hits']['hits']:
@@ -68,7 +69,7 @@ def log_features(es, judgmentsByQid):
features = featuresPerDoc[judgment.docId] # If KeyError, then we have a judgment but no document in index
judgment.features = features
except KeyError:
print("Missing movie %s" % judgment.docId)
print("Missing document %s" % judgment.docId)


def build_train_file(judgmentsWithFeatures, filename):
@@ -79,9 +80,10 @@


if __name__ == "__main__":
from judgments import judgmentsFromFile, judgmentsByQid
from ltr.judgments import judgmentsFromFile, judgmentsByQid
from elasticsearch import Elasticsearch
es = Elasticsearch()
judgmentsByQid = judgmentsByQid(judgmentsFromFile('sample_judgments.txt'))
judgmentsByQid = judgmentsByQid(judgmentsFromFile('./ltr/human_judgments.txt'))
print(judgmentsByQid)
log_features(es, judgmentsByQid)
build_train_file(judgmentsByQid, "sample_judgments_wfeatures.txt")
build_train_file(judgmentsByQid, "./ltr/human_judgments_wfeatures.txt")
32 changes: 32 additions & 0 deletions misc/ltr/human_judgments.txt
@@ -0,0 +1,32 @@
# qid:1: não gerou documento*1
# qid:2: documento*1
# qid:3: processo*1

0 qid:3 # 90d770145b6eef3274e81ae18727498a5d02a73a processo
2 qid:3 # 0233b59d4625333383eed758b767bde71f34114a processo
4 qid:3 # d8f55c963d088b52f86e4e65311ef33152ff0781 processo
2 qid:3 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f processo
0 qid:3 # 90d770145b6eef3274e81ae18727498a5d02a73a processo
2 qid:2 # 0233b59d4625333383eed758b767bde71f34114a documento
4 qid:2 # d8f55c963d088b52f86e4e65311ef33152ff0781 documento
2 qid:2 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f documento
0 qid:2 # 90d770145b6eef3274e81ae18727498a5d02a73a documento
2 qid:2 # 0233b59d4625333383eed758b767bde71f34114a documento
4 qid:2 # d8f55c963d088b52f86e4e65311ef33152ff0781 documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
0 qid:1 # 90d770145b6eef3274e81ae18727498a5d02a73a não gerou documento
2 qid:1 # 0233b59d4625333383eed758b767bde71f34114a não gerou documento
4 qid:1 # d8f55c963d088b52f86e4e65311ef33152ff0781 não gerou documento
1 qid:1 # 72a65ed97b3d4e104cc86d9ead3bbfbdb44b493f não gerou documento
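
For reference, this file appears to follow the RankLib-style judgment format parsed by judgments.py below: each body line is "<grade> qid:<n> # <docId> <query keywords>", where the single-digit grade runs from 0 (irrelevant) up to 4 (highly relevant), and the "*1" suffix in the header comments is a per-query weight that duplicateJudgmentsByWeight() uses to replicate a query's judgment group.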
Empty file removed misc/ltr/judgements.txt
Empty file.
122 changes: 122 additions & 0 deletions misc/ltr/judgments.py
@@ -0,0 +1,122 @@
import re

class Judgment:
    def __init__(self, grade, qid, keywords, docId, weight=1):
        self.grade = grade
        self.qid = qid
        self.keywords = keywords
        self.docId = docId
        self.features = []  # 0th feature is ranklib feature 1
        self.weight = weight

    def sameQueryAndDoc(self, other):
        return self.qid == other.qid and self.docId == other.docId

    def __str__(self):
        return "grade:%s qid:%s (%s) docid:%s" % (self.grade, self.qid, self.keywords, self.docId)

    def toRanklibFormat(self):
        featuresAsStrs = ["%s:%s" % (idx+1, feature) for idx, feature in enumerate(self.features)]
        comment = "# %s\t%s" % (self.docId, self.keywords)
        return "%s\tqid:%s\t%s %s" % (self.grade, self.qid, "\t".join(featuresAsStrs), comment)


def _queriesToHeader(qidToKwDict):
    rVal = ""
    for qid, kws in qidToKwDict.items():
        rVal += "# qid:%s: %s" % (qid, kws[0])
        rVal += "*%s\n" % kws[1]
    rVal += "\n"
    return rVal


def _queriesFromHeader(lines):
    """ Parses out the mapping between query id and user keywords
        from header comments, i.e.:
        # qid:523: First Blood
        returns a dict mapping all query ids to search keywords"""
    # Regex can be debugged here:
    # http://www.regexpal.com/?fam=96564
    regex = re.compile(r'#\sqid:(\d+?):\s+?(.*)')
    rVal = {}
    for line in lines:
        if line[0] != '#':
            break
        m = re.match(regex, line)
        if m:
            keywordAndWeight = m.group(2).split('*')
            keyword = keywordAndWeight[0]
            weight = 1
            if len(keywordAndWeight) > 1:
                weight = int(keywordAndWeight[1])
            rVal[int(m.group(1))] = (keyword, weight)

    return rVal

def _judgmentsFromBody(lines):
    """ Parses out judgment/grade, query id, and docId in a line such as:
        4 qid:523 # a01 Grade for Rambo for query Foo
        <judgment> qid:<queryid> # docId <rest of comment ignored...>"""
    # Regex can be debugged here:
    # http://www.regexpal.com/?fam=96565
    regex = re.compile(r'^(\d)\s+qid:(\d+)\s+#\s+([^\s]*).*')
    for line in lines:
        m = re.match(regex, line)
        if m:
            print("%s,%s,%s" % (m.group(1), m.group(2), m.group(3)))
            yield int(m.group(1)), int(m.group(2)), m.group(3)


def judgmentsFromFile(filename):
    with open(filename) as f:
        qidToKeywords = _queriesFromHeader(f)
    with open(filename) as f:
        for grade, qid, docId in _judgmentsFromBody(f):
            yield Judgment(grade=grade, qid=qid, keywords=qidToKeywords[qid][0], weight=qidToKeywords[qid][1], docId=docId)


def judgmentsToFile(filename, judgmentsList):
    judgToQid = judgmentsByQid(judgmentsList)  # Pretty hideously slow stuff
    fileHeader = _queriesToHeader({qid: (judgs[0].keywords, judgs[0].weight) for qid, judgs in judgToQid.items()})
    judgByQid = sorted(judgmentsList, key=lambda j: j.qid)
    with open(filename, 'w+') as f:
        f.write(fileHeader)
        for judg in judgByQid:
            f.write(judg.toRanklibFormat() + '\n')


def judgmentsByQid(judgments):
    rVal = {}
    for judgment in judgments:
        try:
            rVal[judgment.qid].append(judgment)
        except KeyError:
            rVal[judgment.qid] = [judgment]
    return rVal


def duplicateJudgmentsByWeight(judgmentsByQid):
    rVal = {}
    from copy import deepcopy
    # start new qids after the highest existing one
    maxQid = max(judgmentsByQid.keys(), default=0)
    print("maxQid %s" % maxQid)
    for qid, judgments in judgmentsByQid.items():
        rVal[qid] = judgments
        if judgments[0].weight > 1:
            # duplicate the query group (weight - 1) extra times under fresh
            # qids, mutating the copies rather than the original judgments
            for _ in range(judgments[0].weight - 1):
                maxQid += 1
                copied = deepcopy(judgments)
                for judg in copied:
                    judg.qid = maxQid
                rVal[maxQid] = copied
    return rVal

if __name__ == "__main__":
    from sys import argv
    for judgment in judgmentsFromFile(argv[1]):
        print(judgment)
58 changes: 29 additions & 29 deletions misc/ltr/load_features_and_model.py → misc/ltr/load_features.py
@@ -10,7 +10,7 @@ def save_model(es_host, feature_set="m05"):
"model": {
"name": "m05_model",
"model": {
"type": "model/linear"
"type": "model/xgboost+json"
}
}
}
@@ -22,39 +22,39 @@
resp = requests.post(fullPath)
if (resp.status_code >= 300):
print(resp.text)

features = [f[:-5] for f in listdir("./features") if isfile(join("./features", f))]
modelContent = {f: 1 for f in features}
modelContent = json.dumps(modelContent)
path = "_ltr/_featureset/%s/_createmodel" % feature_set
fullPath = urljoin(es_host, path)
modelPayload['model']['model']['definition'] = modelContent
print("POST %s" % fullPath)
resp = requests.post(fullPath, json.dumps(modelPayload), headers={"Content-Type": "application/json"})
print(resp.status_code)
if (resp.status_code >= 300):
print(resp.text)
with open('./ltr/xgboost_model.json') as modelFile:
modelContent = modelFile.read()
path = "_ltr/_featureset/%s/_createmodel" % feature_set
fullPath = urljoin(es_host, path)
modelPayload['model']['model']['definition'] = modelContent
print("POST %s" % fullPath)
resp = requests.post(fullPath, json.dumps(modelPayload), headers={"Content-Type": "application/json"})
print(resp.status_code)
if (resp.status_code >= 300):
print(resp.text)

def get_feature(feature_name):
with open('./features/%s.json' % feature_name) as f:
with open('./ltr/features/%s.json' % feature_name) as f:
return json.loads(f.read())

def each_feature():
features = [f[:-5] for f in listdir("./features") if isfile(join("./features", f))]
try:
for feature in features:
parsedJson = get_feature(feature)
template = parsedJson['query']
feature_spec = {
"name": "%s" % feature,
"params": ["consulta"],
"template": template
}
print("Loading feature %s" % feature)
print(feature_spec)
yield feature_spec
except IOError:
pass
features = [f[:-5] for f in listdir("./ltr/features") if isfile(join("./ltr/features", f))]
with open('./ltr/featmap.txt', "w") as f:
try:
for idx, feature in enumerate(features):
parsedJson = get_feature(feature)
template = parsedJson['query']
feature_spec = {
"name": "%s" % feature,
"params": ["consulta"],
"template": template
}
print("Loading feature %s" % feature)
# feature types: use i for indicator and q for quantity
f.write(f"{idx} {feature} q" + "\n")
yield feature_spec
except IOError:
pass

def load_features(es_host, feature_set_name='m05'):
feature_set = {
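
The model type above changes from model/linear to model/xgboost+json, and save_model() now uploads the contents of ./ltr/xgboost_model.json, but the PR does not show how that file is produced. A minimal training sketch, assuming the logged judgments are saved in LIBSVM format with qid markers (with '#' comments stripped, rows sorted and grouped by qid) and that featmap.txt was written by each_feature(); scikit-learn is used here only as a convenient LIBSVM reader:

import numpy as np
import xgboost as xgb
from sklearn.datasets import load_svmlight_file

# Read labels, features, and query ids from the RankLib/LIBSVM-style file
X, y, qid = load_svmlight_file('./ltr/human_judgments_wfeatures.txt', query_id=True)
dtrain = xgb.DMatrix(X, label=y)
# group sizes per query; np.unique returns qids in ascending order,
# which matches the file order under the sortedness assumption above
_, counts = np.unique(qid, return_counts=True)
dtrain.set_group(counts)

bst = xgb.train({'objective': 'rank:pairwise'}, dtrain, num_boost_round=50)

# Dump each tree as JSON, naming features via featmap.txt, and wrap the dumps
# in a JSON array: the format the LTR plugin accepts for "model/xgboost+json"
trees = bst.get_dump(fmap='./ltr/featmap.txt', dump_format='json')
with open('./ltr/xgboost_model.json', 'w') as f:
    f.write('[' + ','.join(trees) + ']')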