Fix esbulk errors
esbulk seems to have been causing errors with the previous settings. We did
not notice them before because they do not halt the image-building process,
but the database becomes inconsistent, as documents fail to import.

According to an issue in the library's GitHub repository:

miku/esbulk#33

and the README:

https://github.com/miku/esbulk?tab=readme-ov-file#esbulk

Using default settings might solve those errors. So far, it seems to
be working.
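
For anyone verifying the fix, a quick way to spot the silent failures described above is to compare each index's document count against its source file. A minimal sketch, assuming Elasticsearch is reachable on localhost:9200 as in the script:

# List every index with its document count and health; an index holding far
# fewer documents than its input file indicates a partially failed import.
curl -s 'http://localhost:9200/_cat/indices?v&h=index,docs.count,health'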
javfg authored and jdhayhurst committed Sep 25, 2024
1 parent e875e09 commit a12b778
Showing 1 changed file with 9 additions and 9 deletions.
@@ -45,7 +45,7 @@ function run_elasticsearch() {
-e "bootstrap.memory_lock=true" \
-e "search.max_open_scroll_context=5000" \
-e ES_JAVA_OPTS="-Xms${JVM_SIZE_HALF}g -Xmx${JVM_SIZE_HALF}g" \
-e "thread_pool.write.queue_size=1000" \
-e "thread_pool.write.queue_size=-1" \
-v ${pos_es_docker_vol_data}:/usr/share/elasticsearch/data \
-v ${pos_es_docker_vol_logs}:/usr/share/elasticsearch/logs \
--ulimit memlock=-1:-1 \
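
For context on the change above: for Elasticsearch's fixed-size write thread pool, queue_size=-1 makes the queue unbounded, so bulk requests wait instead of being rejected. A hedged sketch for watching write-queue pressure during a load (same localhost:9200 server as the script):

# Report active, queued, and rejected task counts for the write thread pool;
# a growing 'rejected' column would point to write-queue pressure.
curl -s 'http://localhost:9200/_cat/thread_pool/write?v&h=node_name,name,active,queue,rejected'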
@@ -88,19 +88,19 @@ function load_data_into_es_index() {
#log "[INFO][${index_name}] Loading data file '${file}' with id '${id}'"
for ((i = 1; i <= max_retries; i++)); do
log "[INFO][${index_name}] Loading data file '${file}' with id '${id}' - Attempt #$i"
gsutil cp "${file}" - | esbulk -size 2000 -w 4 -index "${index_name}" -type _doc -server http://localhost:9200 -id "${id}" && break || log "[ERROR][${index_name}] Loading data file '${file}' with id '${id}' - FAILED Attempt #$i, retrying..."
gsutil cp "${file}" - | esbulk -index "${index_name}" -type _doc -server http://localhost:9200 -id "${id}" && break || log "[ERROR][${index_name}] Loading data file '${file}' with id '${id}' - FAILED Attempt #$i, retrying..."
sleep 1
done
if [ $i -gt $max_retries ]; then
log "[ERROR][${index_name}] Loading data file '${file}' with id '${id}' - ALL ATTEMPTS FAILED."
return 1
fi
#gsutil cp ${file} - | esbulk -size 2000 -w 8 -index ${index_name} -type _doc -server http://localhost:9200 -id ${id}
#gsutil cp ${file} - | esbulk -size 2000 -w 8 -index ${index_name} -type _doc -server http://localhost:9200 -id ${id}
else
#log "[INFO][${index_name}] Loading data file '${file}' WITHOUT id"
for ((i = 1; i <= max_retries; i++)); do
log "[INFO][${index_name}] Loading data file '${file}' WITHOUT id - Attempt #$i"
gsutil cp ${file} - | esbulk -size 2000 -w 4 -index ${index_name} -type _doc -server http://localhost:9200 && break || log "[ERROR][${index_name}] Loading data file '${file}' WITHOUT id - FAILED Attempt #$i, retrying..."
gsutil cp "${file}" - | esbulk -index "${index_name}" -type _doc -server http://localhost:9200 && break || log "[ERROR][${index_name}] Loading data file '${file}' WITHOUT id - FAILED Attempt #$i, retrying..."
sleep 1
done
if [ $i -gt $max_retries ]; then
@@ -174,7 +174,7 @@ function do_load_etl_data_into_es_parallel() {
) &
fi
done

# Wait for all background jobs to complete
wait

@@ -218,13 +218,13 @@ function do_load_etl_data_into_es_sequential() {
 fi
 fi
 done

 if $all_jobs_done; then
 break
 else
 sleep 1
 fi
-done
+done
 }


@@ -265,7 +265,7 @@ function load_etl_data_into_es() {
 job_param_index_settings["$index_name"]=${index_settings}
 job_param_id["$index_name"]=${id}
 done < "${pos_es_path_etl_ingestion_config}"

 # Add the jobs for loading Evidence data into Elastic Search
 for evidence_path in $( gsutil ls ${pos_data_release_path_etl_json}/evidence | grep sourceId ); do
 # Extract the sourceId from the path
@@ -327,4 +327,4 @@ load_etl_data_into_es
 # Print Elastic Search summary
 print_es_summary
 # Stop Elastic Search
-stop_elasticsearch
+stop_elasticsearch
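
As a final sanity check, the default-settings invocation adopted here can be exercised outside the image build. A minimal sketch; data.ndjson and test-index are hypothetical placeholders:

# Load newline-delimited JSON from stdin using esbulk's built-in defaults for
# batch size and worker count, mirroring the updated script.
esbulk -index test-index -type _doc -server http://localhost:9200 < data.ndjson

# Confirm that every document arrived.
curl -s 'http://localhost:9200/test-index/_count?pretty'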
