Commit
dockerfile cleanup; enforce text LF line endings (#81)
frasergr authored Jun 18, 2023
1 parent 3945a77 commit 4079020
Showing 5 changed files with 46 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
* text=auto eol=lf
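
With this attribute in place, Git stores text files with LF in the repository and checks them out with LF endings. A minimal sketch of how a contributor might re-apply the new rule to an already-checked-out clone (these commands are not part of this commit; the example path and commit message are placeholders):

# re-apply .gitattributes to files that are already tracked in the working tree
git add --renormalize .

# check which line-ending attributes now apply to a file
git check-attr text eol -- collector/scripts/sitemap.py

# commit the renormalized files
git commit -m "Normalize line endings to LF"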
76 changes: 38 additions & 38 deletions collector/scripts/sitemap.py
@@ -1,39 +1,39 @@
(Line-ending normalization only: every line is converted to LF, so the file content below is otherwise unchanged.)
import requests
import xml.etree.ElementTree as ET
from scripts.link import parse_links
import re

def parse_sitemap(url):
    response = requests.get(url)
    root = ET.fromstring(response.content)

    urls = []
    for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
        for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
            if not has_extension_to_ignore(loc.text):
                urls.append(loc.text)
            else:
                print(f"Skipping filetype: {loc.text}")

    return urls

# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
def sitemap():
    sitemap_url = input("Enter the URL of the sitemap: ")

    if(len(sitemap_url) == 0):
        print("No valid sitemap provided!")
        exit(1)

    url_array = parse_sitemap(sitemap_url)

    #parse links from array
    parse_links(url_array)

def has_extension_to_ignore(string):
    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf']

    pattern = r'\b(' + '|'.join(re.escape(ext) for ext in image_extensions) + r')\b'
    match = re.search(pattern, string, re.IGNORECASE)

    return match is not None
8 changes: 2 additions & 6 deletions docker/Dockerfile
@@ -34,12 +34,10 @@ RUN groupadd -g $ARG_GID anythingllm && \
 # Copy docker helper scripts
 COPY ./docker/docker-entrypoint.sh /usr/local/bin/
 COPY ./docker/docker-healthcheck.sh /usr/local/bin/
-COPY ./docker/dual_boot.sh /usr/local/bin/
 
 # Ensure the scripts are executable
 RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \
-  chmod +x /usr/local/bin/docker-healthcheck.sh && \
-  chmod 777 /usr/local/bin/dual_boot.sh
+  chmod +x /usr/local/bin/docker-healthcheck.sh
 
 USER anythingllm
 
@@ -91,6 +89,4 @@ HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \
   CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1
 
 # Run the server
-ENTRYPOINT ["docker-entrypoint.sh"]
-
-CMD /bin/bash /usr/local/bin/dual_boot.sh
+ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"]
8 changes: 5 additions & 3 deletions docker/docker-entrypoint.sh
@@ -1,3 +1,5 @@
-#!/usr/bin/env bash
-
-exec "$@"
+#!/bin/bash
+node /app/server/index.js &
+{ FLASK_ENV=production FLASK_APP=wsgi.py cd collector && gunicorn --workers 4 --bind 0.0.0.0:8888 wsgi:api; } &
+wait -n
+exit $?
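
With dual_boot.sh gone, the entrypoint itself now launches both services in the background, the Node server and the gunicorn collector, and wait -n returns as soon as either process exits, so the container stops when one of them dies. A rough sketch of exercising the image locally (not part of this commit; the image tag and the 3001 server port are assumptions, only the 8888 collector port appears in this diff):

# build from the repo root so the docker/ scripts are inside the build context
docker build -t anythingllm:local -f docker/Dockerfile .

# run it: 8888 is the collector port from this diff, 3001 is an assumed server port
docker run --rm -p 3001:3001 -p 8888:8888 anythingllm:local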
5 changes: 0 additions & 5 deletions docker/dual_boot.sh

This file was deleted.
