Skip to content

Commit

Permalink
i #284 Refactor download_pipermail function
Browse files Browse the repository at this point in the history
- Remove archive_url and archive_type parameters from download_pipermail().
- Add start_year_month and end_year_month parameters for date filtering.
- Remove convert_pipermail_to_mbox() function, as download_pipermail() now handles file conversion automatically.
- Change file naming convention to 'kaiaulu_'YYYYMM.mbox'.
- Attempt to download and decompress files directly without saving .gz to disk, but could not establish a valid connection.

Signed-off-by: Dao McGill <[email protected]>
  • Loading branch information
daomcgill committed Sep 15, 2024
1 parent d2ce222 commit 7c585ae
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 50 deletions.
136 changes: 86 additions & 50 deletions R/mail.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,63 +7,99 @@
############## Downloader ##############

#' Download all pipermail files in an archive as mbox files
#' @param archive_url An url pointing to a pipermail archive
#' @param mailing_list The name of the mailing list being downloaded
#' @param archive_type The name of the type of archive that the mailing list is stored in
#' @param start_year_month The year and month of the first file to be downloaded
#' @param end_year_month The year and month of the last file to be downloaded
#' @param save_folder_path The folder path in which all the downloaded pipermail files will be stored
#' @return Returns `destination`, a vector of the downloaded files in the current working directory
#' @return Returns `downloaded_files`, a vector of the downloaded files in the current working directory
#' @export
download_pipermail <- function(archive_url, mailing_list, archive_type, save_folder_path) {

#Get page
pagedata <- httr::GET(archive_url)

#Parse html file into object
tbls_xml <- XML::htmlParse(pagedata)

#Extract href tablenodes from html table
tableNodes <- XML::getNodeSet(tbls_xml, "//td/a[@href]")

#Extract filenames from tablenode content with xmlGetAtrr
hrefs <- sapply(tableNodes, XML::xmlGetAttr, 'href')

#Create Vector
files <- vector()
file_names <- vector()

#Compose download urls for both gunzipped and plain text files
for (i in hrefs ){
if (endsWith(i, ".txt.gz")){
# Converts month from text form into a number for the naming convention
f_month <- match(sub("[^_]*-","", sub(".txt.gz","",i)), month.name)
# Retrieves year number for the naming convention
f_year <- sub("-[^_]*", "", i)
# txt files are actually mbox files, so this renames the extension
file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month))
# Saves regular name so that function can access correct url
i <- stringi::stri_c(archive_url, i, sep = "/")
files <- c(files, i)
} else if (endsWith(i, ".txt")) {
# Same logic, but with txt
f_month <- match(sub("[^_]*-","", sub(".txt","",i)), month.name)
f_year <- sub("-[^_]*", "", i)
file_names <- c(file_names, sprintf("%s%02d.mbox", f_year, f_month))
i <- stringi::stri_c(archive_url, i, sep = "/")
files <- c(files, i)
download_pipermail <- function(mailing_list, start_year_month, end_year_month, save_folder_path) {

# Create directory if it does not exist
if (!dir.exists(save_folder_path)) {
dir.create(save_folder_path, recursive = TRUE)
}

# Get mailing list contents
response <- GET(mailing_list)

# Parse the response
parsed_response <- content(response, "text")
doc_obj <- htmlParse(parsed_response, asText = TRUE)

# Table rows
rows <- getNodeSet(doc_obj, "//tr")

# Skip header row
data_rows <- rows[-1]

# Vector for link storage
links = c()

# Extract the date and link from each row
for (row in data_rows) {
# Date in YYYYMM format
date_extracted <- xpathSApply(row, ".//td[1]", xmlValue)
date_cleaned <- stri_replace_last_regex(date_extracted, pattern = ":$", replacement = "")
date_cleaned <- stri_trim_both(date_cleaned)
# Parse the date
# Add 01 as dummy to make it a valid date
date_parsed <- as.Date(paste0("01 ", date_cleaned), format = "%d %B %Y")
year_month <- format(date_parsed, "%Y%m")

# Check if date is within range
if (year_month >= start_year_month & year_month <= end_year_month) {
# get href from column 3
link_nodes <- xpathSApply(row, ".//td[3]/a", xmlGetAttr, 'href')
# Store the link in links
link <- link_nodes[1]
links <- c(links, link)
}
}
amount <- length(files)
# File downloading loop
for (i in 1:amount){

#download file and place it at the destination
save_file_name <- stringi::stri_c(mailing_list, archive_type, file_names[[i]], sep = "_")
save_file_path <- stringi::stri_c(save_folder_path, save_file_name, sep = "/")
httr::GET(files[[i]], httr::write_disk(save_file_path, overwrite=TRUE))
# Vector for downloaded files
downloaded_files <- c()
for (i in seq_along(links)) {
link <- links[i]

# Extract the name without the .txt.gz extension
base_name <- gsub("\\.txt\\.gz$", "", link)

# Parse the date from the base name
date_parsed <- as.Date(paste0("01-", base_name), format = "%d-%Y-%B")
year_month_clean <- format(date_parsed, "%Y%m")

# Download URL
download_url <- paste0(mailing_list, link)

# Define the destination file
# Rename (also converts to mbox by changing extension to .mbox)
dest_gz <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox.gz'))
dest <- file.path(save_folder_path, paste0('kaiaulu_', year_month_clean, '.mbox'))

# Download the gz mbox file
cat("Downloading:", download_url, "\n")
GET(download_url, write_disk(dest_gz, overwrite = TRUE))

# Unzip the file
gz_con <- gzfile(dest_gz, open = "rb")
out_con <- file(dest, open = "wb")
while (TRUE) {
bytes <- readBin(gz_con, what = raw(), n = 1024 * 1024)
if (length(bytes) == 0) break
writeBin(bytes, out_con)
}
close(gz_con)
close(out_con)

# Remove the gz file
file.remove(dest_gz)

# Add the downloaded file to the list
downloaded_files <- c(downloaded_files, dest)
}

#Return filenames
return(save_folder_path)
# Return downloaded files
return(downloaded_files)

}

Expand Down
8 changes: 8 additions & 0 deletions conf/helix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,14 @@ mailing_list:
mbox: ../../rawdata/helix/mod_mbox/helix-user/
mailing_list: helix-user
archive_type: apache
# Using for testing R/mail.R/pipermail_downloader()
pipermail_key:
archive_url: https://mta.openssl.org/mailman/listinfo/
mailing_list: https://mta.openssl.org/pipermail/openssl-users/
# archive_type
start_year_month: 202310
end_year_month: 202405
save_folder_path: "save_folder_mail"

issue_tracker:
jira:
Expand Down

0 comments on commit 7c585ae

Please sign in to comment.