Skip to content

Commit

Permalink
renamed outpath directory
Browse files Browse the repository at this point in the history
  • Loading branch information
mahinth1 committed Oct 23, 2024
1 parent d761b2e commit 9754b79
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 10 deletions.
1 change: 1 addition & 0 deletions stages/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/brick
16 changes: 7 additions & 9 deletions stages/01_get_openaccess.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
import json
import shutil

#file_lock = multiprocessing.Lock()

#### Define functions #####
##### Functions #####

# Get last processed date if applicable
def get_last_processed_date():
last_processed_file = Path('last_processed_date.txt')
last_processed_file = Path('stages/last_processed_date.txt')
if last_processed_file.exists():
with open(last_processed_file, 'r') as file:
return datetime.strptime(file.read(), "%Y-%m-%d").date()
Expand Down Expand Up @@ -106,14 +106,11 @@ def process_file(file_info):

# Save last processed date into a file
def save_last_processed_date(processed_date):
with open('last_processed_date.txt', 'w') as file:
with open('stages/last_processed_date.txt', 'w') as file:
file.write(processed_date.strftime("%Y-%m-%d"))


# Check the output parquet files and remove duplicates


#### Execution ######
def remove_duplicates(parquet_dir):
parquet_dir = Path(parquet_dir)
parquet_files = parquet_dir.glob('*.parquet')
Expand Down Expand Up @@ -142,13 +139,14 @@ def remove_duplicates(parquet_dir):
print("\nDuplicate doi have been removed.")


##### Execution #####

# Retrieve the last processed date, or an arbitrary default date
last_processed_date = get_last_processed_date()
print(last_processed_date)

# Create directory for the processed data
raw_path = Path('brick')
raw_path = Path('brick/articles.parquet')
raw_path.mkdir(exist_ok=True)

# Number of output files to split into (can be more or less)
Expand Down Expand Up @@ -179,4 +177,4 @@ def remove_duplicates(parquet_dir):


# Check the output parquet files and remove duplicates
remove_duplicates(raw_path)
remove_duplicates(raw_path)
2 changes: 1 addition & 1 deletion stages/02_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def extract_metadata(doi):
##### Execution #####

# input and output directories
input_dir = Path('stages/brick')
input_dir = Path('stages/brick/articles.parquet')
output_dir = Path('stages/brick/pdfs')
output_dir.mkdir(parents=True, exist_ok=True)

Expand Down
6 changes: 6 additions & 0 deletions stages/brick.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
outs:
- md5: 8af8f3b91782fa9b271166c34107642f.dir
size: 13369438821
nfiles: 5132
hash: md5
path: brick

0 comments on commit 9754b79

Please sign in to comment.