Skip to content

Commit

Permalink
RR-44 Use title for crossref if no doi response (#2)
Browse files Browse the repository at this point in the history
* add unit tests
and crossref data by title function

* god i love json

* fixing bugs
  • Loading branch information
tmwclaxton authored Dec 28, 2023
1 parent 5013af8 commit e00533d
Show file tree
Hide file tree
Showing 7 changed files with 224 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ REQUEUE_REQUESTS=false
DISPATCHER_MAX_MESSAGES=10
DISPATCHER_VISIBILITY_TIMEOUT=30
DISPATCHER_WAIT_TIME_SECONDS=20
GROBID_URL=http://grobid:8070
GROBID_URL=http://grobid:8070
GRACE_PERIOD_WORKERS=3
23 changes: 19 additions & 4 deletions internal/dispatcher/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,18 @@ func Worker(id int, messageQueue <-chan *sqs.Message, svc *sqs.SQS, sqsURL, s3Bu
log.Fatalf("Error parsing MINIMUM_GAP_BETWEEN_REQUESTS_SECONDS: %v", err)
}
gracePeriodRequests, _ := strconv.Atoi(helpers.GetEnvVariable("GRACE_PERIOD_REQUESTS"))
allowedWorkers, _ := strconv.Atoi(helpers.GetEnvVariable("GRACE_PERIOD_WORKERS"))

log.Printf("Starting worker %d...\n", id)

for {
message := <-messageQueue

if totalRequests < gracePeriodRequests {
// if worker id is greater than the allowed workers then return
if id > allowedWorkers {
log.Printf("Worker %d is greater than the allowed workers (%d), returning...\n", id, allowedWorkers)
return
}
// Acquire a semaphore before accessing
if err := grobidSemaphore.Acquire(context.Background(), 1); err != nil {
log.Printf("Worker %d could not acquire semaphore: %v\n", id, err)
Expand All @@ -69,6 +74,7 @@ func Worker(id int, messageQueue <-chan *sqs.Message, svc *sqs.SQS, sqsURL, s3Bu
log.Printf("Worker %d releasing semaphore\n", id)
}

message := <-messageQueue
processMessage(id, message, svc, sqsURL, s3Bucket, awsRegion, s)
}
}
Expand Down Expand Up @@ -168,11 +174,20 @@ func processMessage(id int, message *sqs.Message, svc *sqs.SQS, sqsURL, s3Bucket
}

crossRefResponse := &parsing.TidyCrossRefResponse{}
// cross reference data using the DOI

// Cross reference data using the DOI
if tidyGrobidResponse.Doi != "" {
crossRefResponse, err = parsing.CrossReferenceData(tidyGrobidResponse.Doi)
crossRefResponse, err = parsing.CrossRefDataDOI(tidyGrobidResponse.Doi)
if err != nil {
log.Println("Error cross referencing data using DOI:", err)
}
}

// If DOI is not available or failed, try cross-referencing using Title
if crossRefResponse.DOI == "" && tidyGrobidResponse.Title != "" {
crossRefResponse, err = parsing.CrossRefDataTitle(tidyGrobidResponse.Title)
if err != nil {
log.Println("Error cross referencing data:", err)
log.Println("Error cross referencing data using Title:", err)
}
}

Expand Down
137 changes: 93 additions & 44 deletions internal/parsing/crossref.go
Original file line number Diff line number Diff line change
@@ -1,47 +1,62 @@
package parsing

import (
"encoding/xml"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"net/http"
"regexp"
"strings"
)

type CrudeCrossRefResponse struct {
Raw string `xml:",innerxml"`
Title string `xml:"query_result>body>query>doi_record>crossref>journal>journal_article>titles>title"`
Year string `xml:"query_result>body>query>doi_record>crossref>journal>journal_issue>publication_date>year"`
}

type AbstractTemp struct {
JATS []JATs `xml:"query_result>body>query>doi_record>crossref>journal>journal_article"`
// TidyCrossRefResponse is the flattened Crossref metadata returned by
// CrossRefDataDOI and CrossRefDataTitle. Every field is a plain string; an
// empty string means the value was not filled in.
type TidyCrossRefResponse struct {
// Title is taken from the first entry of the Crossref "title" array.
Title string `json:"title"`
// Year is the first number of the first Crossref "date-parts" entry,
// formatted as a decimal string.
Year string `json:"year"`
// Abstract is the Crossref "abstract" value.
Abstract string `json:"abstract"`
// DOI is the Crossref "DOI" value.
DOI string `json:"doi"`
// ISSN is taken from the first entry of the Crossref "ISSN" array.
ISSN string `json:"issn"`
}

type JATs struct {
RawContent string `xml:",innerxml"`
// CrossRefDOIResponse mirrors the subset of the JSON document returned by the
// Crossref "works/{doi}" endpoint that CrossRefDataDOI decodes. The work's
// metadata sits directly under the top-level "message" object.
type CrossRefDOIResponse struct {
	Message struct {
		DOI string `json:"DOI"`
		// Title and ISSN are arrays in the payload and may be empty.
		Title    []string `json:"title"`
		Abstract string   `json:"abstract"`
		ISSN     []string `json:"ISSN"`
		Issued   struct {
			// DateParts is a list of [year, month, day]-style integer lists;
			// CrossRefDataDOI reads the first number of the first entry as
			// the year.
			DateParts [][]int `json:"date-parts"`
		} `json:"issued"` // explicit tag, matching CrossRefTitleResponse
	} `json:"message"`
}

type TidyCrossRefResponse struct {
Title string `json:"title"`
Year string `json:"year"`
Abstract string `xml:"abstract>p"`
// CrossRefTitleResponse mirrors the subset of the JSON document returned by
// the Crossref "works?query.bibliographic=..." search that CrossRefDataTitle
// decodes. Matching works are listed under "message.items"; CrossRefDataTitle
// requests a single row and reads only the first item.
type CrossRefTitleResponse struct {
Message struct {
Items []struct {
DOI string `json:"DOI"`
// Title is an array in the payload; the first entry is the one used.
Title []string `json:"title"`
Abstract string `json:"abstract"`
// ISSN is an array in the payload; the first entry is the one used.
ISSN []string `json:"ISSN"`
Issued struct {
// DateParts is a list of integer lists; the first number of the first
// entry is read as the year (assumed, per the extraction code).
DateParts [][]int `json:"date-parts"`
} `json:"issued"`
} `json:"items"`
} `json:"message"`
}

func CrossReferenceData(doi string) (*TidyCrossRefResponse, error) {
func CrossRefDataDOI(doi string) (*TidyCrossRefResponse, error) {
log.Printf("Cross referencing data for DOI: %s\n", doi)

client := &http.Client{}

var response *http.Response
var err error

response, err = client.Get("https://api.crossref.org/works/" + doi + "/transform/application/vnd.crossref.unixsd+xml")
response, err = client.Get("https://api.crossref.org/works/" + doi)
if err != nil {
return nil, err
return &TidyCrossRefResponse{}, err
}

defer func(Body io.ReadCloser) {
Expand All @@ -51,44 +66,78 @@ func CrossReferenceData(doi string) (*TidyCrossRefResponse, error) {
}
}(response.Body)

xmlBytes, err := ioutil.ReadAll(response.Body)
// Parse JSON response
var crossRefResponse CrossRefDOIResponse
err = json.NewDecoder(response.Body).Decode(&crossRefResponse)
if err != nil {
return nil, err
return &TidyCrossRefResponse{}, err
}
//log.Printf("Crossref response: %s\n", xmlBytes)

var crudeResponse CrudeCrossRefResponse
err = xml.Unmarshal(xmlBytes, &crudeResponse)
if err != nil {
return nil, err
// Extract data from the response
item := crossRefResponse.Message
tidyCrossRefResponse := &TidyCrossRefResponse{
Title: item.Title[0],
Year: fmt.Sprintf("%d", item.Issued.DateParts[0][0]), // assuming the date-parts contain the year
Abstract: item.Abstract,
DOI: item.DOI,
ISSN: item.ISSN[0],
}

tidyCrossRefResponse := TidyCrossRefData(&crudeResponse, &AbstractTemp{})
if tidyCrossRefResponse.Abstract != "" {
tidyCrossRefResponse.Abstract = strings.TrimSpace(tidyCrossRefResponse.Abstract)
re := regexp.MustCompile(`\s+`)
tidyCrossRefResponse.Abstract = re.ReplaceAllString(tidyCrossRefResponse.Abstract, " ")
}

return tidyCrossRefResponse, nil
}

func TidyCrossRefData(crudeResponse *CrudeCrossRefResponse, AbstractTemp *AbstractTemp) *TidyCrossRefResponse {
// if there is more than one journal article find the one with the namespace or prefix of http://www.ncbi.nlm.nih.gov/JATS1
var abstract string
if len(AbstractTemp.JATS) > 1 {
for _, journalArticle := range AbstractTemp.JATS {
if journalArticle.RawContent[:len("http://www.ncbi.nlm.nih.gov/JATS1")] == "http://www.ncbi.nlm.nih.gov/JATS1" {
log.Printf("Found journal article with namespace: %s\n", journalArticle.RawContent[:len("http://www.ncbi.nlm.nih.gov/JATS1")])
abstract = journalArticle.RawContent
}
func CrossRefDataTitle(title string) (*TidyCrossRefResponse, error) {
log.Printf("Cross referencing data for title: %s\n", title)

client := &http.Client{}

var response *http.Response
var err error

url := "https://api.crossref.org/works?query.bibliographic=" + title + "&rows=1&offset=0"

// escape the url
url = strings.ReplaceAll(url, " ", "%20")

//log.Printf("Crossref URL: %s\n", url)
response, err = client.Get(url)
if err != nil {
return nil, err
}

defer func(Body io.ReadCloser) {
err := Body.Close()
if err != nil {
fmt.Println("Error closing Crossref response body:", err)
}
}(response.Body)

// Parse JSON response
var crossRefResponse CrossRefTitleResponse
err = json.NewDecoder(response.Body).Decode(&crossRefResponse)
if err != nil {
return nil, err
}

if abstract != "" {
abstract = strings.TrimSpace(abstract)
re := regexp.MustCompile(`\s+`)
abstract = re.ReplaceAllString(abstract, " ")
if len(crossRefResponse.Message.Items) == 0 {
return nil, fmt.Errorf("No matching items found for title: %s", title)
}

return &TidyCrossRefResponse{
Title: crudeResponse.Title,
Year: crudeResponse.Year,
Abstract: abstract,
// Extract data from the response
item := crossRefResponse.Message.Items[0]
tidyResponse := &TidyCrossRefResponse{
Title: item.Title[0],
Year: fmt.Sprintf("%d", item.Issued.DateParts[0][0]), // assuming the date-parts contain the year
Abstract: item.Abstract,
DOI: item.DOI,
ISSN: item.ISSN[0],
}

return tidyResponse, nil
}
99 changes: 99 additions & 0 deletions internal/parsing/crossref_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package parsing

import (
"testing"
)

// CrossRefResponse holds the field values a test expects a Crossref lookup to
// produce. All fields are plain strings and are compared one by one.
type CrossRefResponse struct {
	Title, Year, DOI, Abstract, ISSN string
}

// checkField compares one expected field value with the actual one. A match
// is logged; a mismatch marks the test as failed (without stopping it) and
// reports both values so the failure can be diagnosed from the output alone.
func checkField(t *testing.T, fieldName, expected, actual string) {
	// Attribute failures to the calling test line rather than to this helper.
	t.Helper()

	if expected != actual {
		t.Errorf("%s is incorrect: expected %q, got %q", fieldName, expected, actual)
		return
	}
	t.Logf("%s is correct", fieldName)
}

// TestCrossRefDataDOI looks up a known DOI through CrossRefDataDOI and checks
// every returned field against the expected record.
//
// NOTE: CrossRefDataDOI performs a real HTTP request to api.crossref.org, so
// this test needs network access.
func TestCrossRefDataDOI(t *testing.T) {
	expectedResponse := CrossRefResponse{
		Title:    "Tebuconazole alters morphological, behavioral and neurochemical parameters in larvae and adult zebrafish (Danio rerio)",
		Year:     "2017",
		DOI:      "10.1016/j.chemosphere.2017.04.029",
		Abstract: "",
		ISSN:     "0045-6535",
	}

	response, err := CrossRefDataDOI(expectedResponse.DOI)
	if err != nil {
		// The response is not meaningful after a failed lookup; stop here
		// instead of reporting a cascade of field mismatches.
		t.Fatalf("CrossRefDataDOI(%q): %v", expectedResponse.DOI, err)
	}

	checkField(t, "Title", expectedResponse.Title, response.Title)
	checkField(t, "Year", expectedResponse.Year, response.Year)
	checkField(t, "DOI", expectedResponse.DOI, response.DOI)
	checkField(t, "Abstract", expectedResponse.Abstract, response.Abstract)
	checkField(t, "ISSN", expectedResponse.ISSN, response.ISSN)
}

// TestCrossRefDataTitle looks up a known paper by title through
// CrossRefDataTitle and checks every returned field against the expected
// record.
//
// NOTE: CrossRefDataTitle performs a real HTTP request to api.crossref.org,
// so this test needs network access.
func TestCrossRefDataTitle(t *testing.T) {
	expectedResponse := CrossRefResponse{
		Title:    "Tebuconazole alters morphological, behavioral and neurochemical parameters in larvae and adult zebrafish (Danio rerio)",
		Year:     "2017",
		DOI:      "10.1016/j.chemosphere.2017.04.029",
		Abstract: "",
		ISSN:     "0045-6535",
	}

	response, err := CrossRefDataTitle(expectedResponse.Title)
	if err != nil {
		// Stop immediately: the response may be nil after a failed lookup,
		// and reading its fields would panic.
		t.Fatalf("CrossRefDataTitle(%q): %v", expectedResponse.Title, err)
	}

	checkField(t, "Title", expectedResponse.Title, response.Title)
	checkField(t, "Year", expectedResponse.Year, response.Year)
	checkField(t, "DOI", expectedResponse.DOI, response.DOI)
	checkField(t, "Abstract", expectedResponse.Abstract, response.Abstract)
	checkField(t, "ISSN", expectedResponse.ISSN, response.ISSN)
}

// TestCrossRefData2 checks that a second known paper, one that has an
// abstract, yields the same record whether it is looked up by title
// (CrossRefDataTitle) or by DOI (CrossRefDataDOI).
//
// NOTE: both lookups perform real HTTP requests to api.crossref.org, so this
// test needs network access.
func TestCrossRefData2(t *testing.T) {
	expectedResponse := CrossRefResponse{
		Title:    "The toxic effects of deltamethrin on Danio rerio: the correlation among behavior response, physiological damage and AChE",
		Year:     "2016",
		DOI:      "10.1039/c6ra23990k",
		Abstract: "<p>In this work we comprehensively evaluated the effects of deltamethrin, a pyrethroid pesticide, on the behavior, physiology and acetylcholinesterase (AChE) activity of fish.</p>",
		ISSN:     "2046-2069",
	}

	// Lookup by title.
	response, err := CrossRefDataTitle(expectedResponse.Title)
	if err != nil {
		// Stop immediately: the response may be nil after a failed lookup,
		// and reading its fields would panic.
		t.Fatalf("CrossRefDataTitle(%q): %v", expectedResponse.Title, err)
	}

	checkField(t, "Title", expectedResponse.Title, response.Title)
	checkField(t, "Year", expectedResponse.Year, response.Year)
	checkField(t, "DOI", expectedResponse.DOI, response.DOI)
	checkField(t, "Abstract", expectedResponse.Abstract, response.Abstract)
	checkField(t, "ISSN", expectedResponse.ISSN, response.ISSN)

	// Lookup by DOI must produce the same record.
	response, err = CrossRefDataDOI(expectedResponse.DOI)
	if err != nil {
		t.Fatalf("CrossRefDataDOI(%q): %v", expectedResponse.DOI, err)
	}

	checkField(t, "Title", expectedResponse.Title, response.Title)
	checkField(t, "Year", expectedResponse.Year, response.Year)
	checkField(t, "DOI", expectedResponse.DOI, response.DOI)
	checkField(t, "Abstract", expectedResponse.Abstract, response.Abstract)
	checkField(t, "ISSN", expectedResponse.ISSN, response.ISSN)
}
1 change: 1 addition & 0 deletions internal/parsing/grobid.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ type TidyGrobidResponse struct {
Authors []AuthorsRaw `json:"authors"`
Journal string `json:"journal"`
Notes string `json:"notes"`
ISSN string `json:"issn"`
}

type IdnosRaw struct {
Expand Down
9 changes: 9 additions & 0 deletions internal/parsing/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ func CreatePDFDTO(tidyGrobidResponse *TidyGrobidResponse, tidyCrossRefResponse *
log.Println("Using crossref year")
tidyGrobidResponse.Year = tidyCrossRefResponse.Year
}
if tidyCrossRefResponse.ISSN != "" {
log.Println("Using crossref ISSN")
tidyGrobidResponse.ISSN = tidyCrossRefResponse.ISSN
}
if tidyCrossRefResponse.DOI != "" {
log.Println("Using crossref DOI")
tidyGrobidResponse.Doi = tidyCrossRefResponse.DOI
}
}

// trim title and replace '-' with ' '
Expand All @@ -50,6 +58,7 @@ func CreatePDFDTO(tidyGrobidResponse *TidyGrobidResponse, tidyCrossRefResponse *
return &PDFDTO{
Title: tidyGrobidResponse.Title,
DOI: tidyGrobidResponse.Doi,
ISSN: tidyGrobidResponse.ISSN,
Abstract: tidyGrobidResponse.Abstract,
Sections: tidyGrobidResponse.Sections,
Keywords: tidyGrobidResponse.Keywords,
Expand Down
2 changes: 1 addition & 1 deletion internal/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ func (store *Store) CreatePaper(dto *parsing.PDFDTO, userID int64, screenID int6
slug := helpers.GenerateRandomString(14)

// create paper
_, err := store.db.Exec("INSERT INTO papers (slug, user_id, screen_id, title, abstract, year, doi, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", slug, userID, screenID, dto.Title, dto.Abstract, dto.Year, dto.DOI, carbon.Now().DateTimeString(), carbon.Now().DateTimeString())
_, err := store.db.Exec("INSERT INTO papers (slug, user_id, screen_id, title, issn, abstract, year, doi, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", slug, userID, screenID, dto.Title, dto.ISSN, dto.Abstract, dto.Year, dto.DOI, carbon.Now().DateTimeString(), carbon.Now().DateTimeString())
if err != nil {
return Paper{}, err
}
Expand Down

0 comments on commit e00533d

Please sign in to comment.