Dockerfile #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open · wants to merge 4 commits into master
42 changes: 42 additions & 0 deletions DatasetCollection/datacollection_test.py
@@ -0,0 +1,42 @@
import json

import requests
from bs4 import BeautifulSoup


def get_table_data(table_data, html_content):
    """Parse one regexlib search-results page and append each entry to table_data."""
    soup = BeautifulSoup(html_content, "html.parser")
    tables = soup.find_all('table', class_='searchResultsTable')

    for table in tables:
        expression_data = {}

        # Extract expression, description, matches, and non-matches
        expression_data['id'] = len(table_data) + 1
        details_link = table.find('a', class_='buttonSmall', href=True)
        if details_link:
            # The tester link doubles as a details link once the page name is swapped
            expression_data['details_link'] = "https://regexlib.com/" + details_link['href'].replace('RETester.aspx', 'REDetails.aspx')
        expression_data['expression'] = table.find('tr', class_='expression').find('div', class_='expressionDiv').text.strip()
        expression_data['description'] = table.find('tr', class_='description').find('div', class_='overflowFixDiv').text.strip()
        expression_data['matches'] = table.find('tr', class_='matches').find('div', class_='overflowFixDiv').text.strip()
        expression_data['non_matches'] = table.find('tr', class_='nonmatches').find('div', class_='overflowFixDiv').text.strip()

        # Append data to the list
        table_data.append(expression_data)

    return table_data


table_data = []

# 42 search pages of 100 results each (ps=100)
for i in range(1, 43):
    url = "https://regexlib.com/Search.aspx?k=&c=-1&m=-1&ps=100&p=" + str(i)
    response = requests.get(url)
    if response.status_code == 200:
        html_content = response.text
        table_data = get_table_data(table_data, html_content)
        print(len(table_data))
        print(table_data[-1])
    else:
        print(f"Failed to fetch data for page {i}")

# Save data to JSON file
with open('./test.json', 'w') as f:
    json.dump(table_data, f)
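
For reference, a minimal sketch of reading the output back in. The field names (id, details_link, expression, description, matches, non_matches) come straight from get_table_data above; the loop and printing are purely illustrative:

import json

# Each record is a dict with the fields assembled by get_table_data.
# details_link may be absent when no buttonSmall anchor was found.
with open('./test.json') as f:
    records = json.load(f)

print(len(records))
for record in records[:3]:
    print(record['id'], record['expression'])
    print('  matches:', record['matches'])
    print('  non-matches:', record['non_matches'])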
18 changes: 18 additions & 0 deletions Dockerfile
@@ -0,0 +1,18 @@
# Use the continuumio/anaconda3 base image
FROM continuumio/anaconda3

# Set the working directory inside the container
WORKDIR /app

# Copy the contents of the ReDos_Benchmarking directory into the container at /app
COPY ReDos_Benchmarking /app

# Install dependencies
# (conda alternative: conda install -c huggingface -c conda-forge datasets)
RUN pip install datasets tqdm && \
    apt-get update && \
    apt-get -y install gcc mono-mcs python3-dev

# Specify the command to run on container start
CMD ["python", "-c", "from datasets import load_dataset"]
18 changes: 18 additions & 0 deletions RegexEvalDocker/Dockerfile
@@ -0,0 +1,18 @@
# Use the continuumio/anaconda3 base image
FROM continuumio/anaconda3

# Set the working directory inside the container
WORKDIR /app

# Copy the contents of the ReDos_Benchmarking directory into the container at /app
COPY ReDos_Benchmarking /app

# Install dependencies
# (conda alternative: conda install -c huggingface -c conda-forge datasets)
RUN pip install datasets tqdm && \
    apt-get update && \
    apt-get -y install gcc mono-mcs python3-dev

# Specify the command to run on container start
CMD ["python", "-c", "from datasets import load_dataset"]
42 changes: 42 additions & 0 deletions RegexEvalDocker/datacollection_test.py
@@ -0,0 +1,42 @@
import json

import requests
from bs4 import BeautifulSoup


def get_table_data(table_data, html_content):
    """Parse one regexlib search-results page and append each entry to table_data."""
    soup = BeautifulSoup(html_content, "html.parser")
    tables = soup.find_all('table', class_='searchResultsTable')

    for table in tables:
        expression_data = {}

        # Extract expression, description, matches, and non-matches
        expression_data['id'] = len(table_data) + 1
        details_link = table.find('a', class_='buttonSmall', href=True)
        if details_link:
            # The tester link doubles as a details link once the page name is swapped
            expression_data['details_link'] = "https://regexlib.com/" + details_link['href'].replace('RETester.aspx', 'REDetails.aspx')
        expression_data['expression'] = table.find('tr', class_='expression').find('div', class_='expressionDiv').text.strip()
        expression_data['description'] = table.find('tr', class_='description').find('div', class_='overflowFixDiv').text.strip()
        expression_data['matches'] = table.find('tr', class_='matches').find('div', class_='overflowFixDiv').text.strip()
        expression_data['non_matches'] = table.find('tr', class_='nonmatches').find('div', class_='overflowFixDiv').text.strip()

        # Append data to the list
        table_data.append(expression_data)

    return table_data


table_data = []

# 42 search pages of 100 results each (ps=100)
for i in range(1, 43):
    url = "https://regexlib.com/Search.aspx?k=&c=-1&m=-1&ps=100&p=" + str(i)
    response = requests.get(url)
    if response.status_code == 200:
        html_content = response.text
        table_data = get_table_data(table_data, html_content)
        print(len(table_data))
        print(table_data[-1])
    else:
        print(f"Failed to fetch data for page {i}")

# Save data to JSON file
with open('./test.json', 'w') as f:
    json.dump(table_data, f)
1 change: 1 addition & 0 deletions RegexEvalDocker/test.json

Large diffs are not rendered by default.