Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds openalex client as a default client #555

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions paperqa/clients/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from .client_models import MetadataPostProcessor, MetadataProvider
from .crossref import CrossrefProvider
from .journal_quality import JournalQualityPostProcessor
from .openalex import OpenAlexProvider
from .retractions import RetractionDataPostProcessor
from .semantic_scholar import SemanticScholarProvider
from .unpaywall import UnpaywallProvider
Expand All @@ -28,6 +29,7 @@

# Union of everything in DEFAULT_CLIENTS plus the three client classes listed
# explicitly below.
ALL_CLIENTS: Collection[type[MetadataPostProcessor | MetadataProvider]] = {
    *DEFAULT_CLIENTS,
    OpenAlexProvider,
    UnpaywallProvider,
    RetractionDataPostProcessor,
}
Expand Down
30 changes: 1 addition & 29 deletions paperqa/clients/crossref.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@

from .client_models import DOIOrTitleBasedProvider, DOIQuery, TitleAuthorQuery
from .exceptions import DOINotFoundError, make_flaky_ssl_error_predicate
from .shared_dicts import BIBTEX_MAPPING as CROSSREF_CONTENT_TYPE_TO_BIBTEX_MAPPING

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -71,35 +72,6 @@
"source_quality": {"container-title"},
"doc_id": {"DOI"},
}
# Maps a Crossref content-type string to the BibTeX entry type used for it.
# Types with no direct BibTeX equivalent fall back to 'misc'.
CROSSREF_CONTENT_TYPE_TO_BIBTEX_MAPPING: dict[str, str] = {
    "journal-article": "article",
    "journal-issue": "misc",  # No direct equivalent, so 'misc' is used
    "journal-volume": "misc",  # No direct equivalent, so 'misc' is used
    "journal": "misc",  # No direct equivalent, so 'misc' is used
    "proceedings-article": "inproceedings",
    "proceedings": "proceedings",
    "dataset": "misc",  # No direct equivalent, so 'misc' is used
    "component": "misc",  # No direct equivalent, so 'misc' is used
    "report": "techreport",
    "report-series": "techreport",  # 'series' implies multiple tech reports, but each is still a 'techreport'
    "standard": "misc",  # No direct equivalent, so 'misc' is used
    "standard-series": "misc",  # No direct equivalent, so 'misc' is used
    "edited-book": "book",  # Edited books are considered books in BibTeX
    "monograph": "book",  # Monographs are considered books in BibTeX
    "reference-book": "book",  # Reference books are considered books in BibTeX
    "book": "book",
    "book-series": "book",  # Series of books can be considered as 'book' in BibTeX
    "book-set": "book",  # Set of books can be considered as 'book' in BibTeX
    "book-chapter": "inbook",
    "book-section": "inbook",  # Sections in books can be considered as 'inbook'
    "book-part": "inbook",  # Parts of books can be considered as 'inbook'
    "book-track": "inbook",  # Tracks in books can be considered as 'inbook'
    "reference-entry": "inbook",  # Entries in reference books can be considered as 'inbook'
    "dissertation": "phdthesis",  # Dissertations are usually PhD thesis
    "posted-content": "misc",  # No direct equivalent, so 'misc' is used
    "peer-review": "misc",  # No direct equivalent, so 'misc' is used
    "other": "article",  # Assume an article if we don't know the type
}

_ISSUED_WARNINGS = [False, False] # 0 is API key, 1 is email

Expand Down
238 changes: 238 additions & 0 deletions paperqa/clients/openalex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
from __future__ import annotations

import json
import logging
import os
import re
from collections.abc import Collection
from datetime import datetime
from typing import Any
from urllib.parse import quote

import aiohttp

from paperqa.types import DocDetails
from paperqa.utils import convert_acutes, strings_similarity

from .client_models import DOIOrTitleBasedProvider, DOIQuery, TitleAuthorQuery
from .exceptions import DOINotFoundError
from .shared_dicts import BIBTEX_MAPPING

# Root URL from which the OpenAlex request URLs in this module are built.
OPENALEX_BASE_URL = "https://api.openalex.org"
# Handed to aiohttp.ClientTimeout for each request
# (presumably a total timeout in seconds — confirm against the aiohttp docs).
OPENALEX_API_REQUEST_TIMEOUT = 5.0
logger = logging.getLogger(__name__)
nadolskit marked this conversation as resolved.
Show resolved Hide resolved


async def get_openalex_mailto() -> str | None:
    """Get the OpenAlex mailto address.

    Reads the OPENALEX_MAILTO environment variable and logs a warning when it
    is not set.

    Returns:
        The OpenAlex mailto address if available, otherwise None.
    """
    mailto_address = os.environ.get("OPENALEX_MAILTO")
    if mailto_address is None:
        logger.warning(
            "OPENALEX_MAILTO environment variable not set."
            " your request may be deprioritized by OpenAlex."
        )
    # Return the value already read rather than querying the environment again,
    # so the checked value and the returned value can never differ.
    return mailto_address


def _strip_doi_resolver(doi: str) -> str:
    """Return the bare DOI, dropping a leading http(s)://(dx.)doi.org/ prefix."""
    return re.sub(
        r"^https?://(?:dx\.)?doi\.org/", "", doi.strip(), flags=re.IGNORECASE
    )


async def get_doc_details_from_openalex(
    session: aiohttp.ClientSession,
    doi: str | None = None,
    title: str | None = None,
    fields: Collection[str] | None = None,
    title_similarity_threshold: float = 0.75,
) -> DocDetails | None:
    """Get paper details from OpenAlex given a DOI or paper title.

    When a DOI is given it takes priority and the single-work endpoint is
    queried; otherwise a title search is issued and the first hit is used.

    Args:
        session: The active session of the request.
        doi: The DOI of the paper, either bare or as a doi.org URL.
        title: The title of the paper.
        fields: Specific fields to include in the request.
        title_similarity_threshold: The threshold for title similarity.

    Returns:
        The details of the document if found, otherwise None.

    Raises:
        ValueError: If neither DOI nor title is provided.
        DOINotFoundError: If the paper cannot be found.
    """
    mailto = await get_openalex_mailto()
    params: dict[str, str] = {"mailto": mailto} if mailto else {}

    # Strip any resolver prefix up front so the request URL below does not end
    # up with the prefix twice and the DOI check compares like with like.
    bare_doi = _strip_doi_resolver(doi) if doi else ""

    # Empty strings are rejected as well as None: with neither value the
    # request would hit the unfiltered /works listing.
    if not bare_doi and not title:
        raise ValueError("Either a DOI or title must be provided.")

    url = f"{OPENALEX_BASE_URL}/works"
    if bare_doi:
        url += f"/https://doi.org/{quote(bare_doi, safe='')}"
    elif title:
        params["filter"] = f"title.search:{title}"

    if fields:
        params["select"] = ",".join(fields)

    async with session.get(
        url,
        params=params,
        timeout=aiohttp.ClientTimeout(OPENALEX_API_REQUEST_TIMEOUT),
    ) as response:
        try:
            response.raise_for_status()
            response_data = await response.json()
        except (aiohttp.ClientResponseError, json.JSONDecodeError) as exc:
            raise DOINotFoundError("Could not find paper given DOI/title.") from exc

    if response_data.get("status") == "failed":
        raise DOINotFoundError("OpenAlex API returned a failed status for the query.")

    results_data = response_data
    if "filter" in params:
        # A title search wraps the hits in a "results" list; use the first hit.
        results = response_data.get("results") or []
        if not results:
            raise DOINotFoundError(
                "OpenAlex API did not return any items for the query."
            )
        results_data = results[0]

    if bare_doi:
        # Compare bare, case-folded DOIs: the returned value may carry a
        # resolver prefix or differ in letter case from what the caller passed.
        returned_doi = _strip_doi_resolver(results_data.get("doi") or "")
        if returned_doi.casefold() != bare_doi.casefold():
            raise DOINotFoundError("DOI not found in OpenAlex")
    elif (
        title
        # `or ""` guards against a "title" key that is present but null.
        and strings_similarity(results_data.get("title") or "", title)
        < title_similarity_threshold
    ):
        raise DOINotFoundError(f"OpenAlex results did not match for title {title!r}.")

    return await parse_openalex_to_doc_details(results_data)


async def parse_openalex_to_doc_details(message: dict[str, Any]) -> DocDetails:
    """Parse OpenAlex API response to DocDetails.

    Missing or null nested objects (authorships, primary_location, source,
    biblio) and a missing or unparsable publication date are tolerated and
    yield None/empty values instead of raising.

    Args:
        message: The OpenAlex API response message.

    Returns:
        Parsed document details.
    """

    # author_name will be FamilyName, GivenName Middle initial. (if available)
    # there is no standalone "FamilyName" or "GivenName" fields
    # this manually constructs the name into the format the other clients use
    def reformat_name(name: str) -> str:
        # https://regex101.com/r/74vR57/1
        pattern = r"^([^,]+),\s*(.+?)(?:\s+(\w+\.?))?$"
        match = re.match(pattern, name)
        if not match:
            # No "Family, Given" shape: keep the name as it was received.
            return name
        family_name, given_name, middle = match.groups()
        parts = [given_name.strip()]
        if middle:
            parts.append(middle.strip())
        parts.append(family_name.strip())
        return " ".join(parts).strip()

    # raw_author_name may be absent or None for an authorship entry; such
    # entries are skipped so reformat_name only ever receives a string.
    raw_names = [
        authorship.get("raw_author_name")
        for authorship in message.get("authorships") or []
    ]
    authors = [reformat_name(name) for name in raw_names if name]
    sanitized_authors = [convert_acutes(author) for author in authors]

    # `or {}` (rather than a .get default) also covers keys that are present
    # but hold None, which would otherwise break the chained lookups.
    primary_location = message.get("primary_location") or {}
    source = primary_location.get("source") or {}
    biblio = message.get("biblio") or {}

    raw_date = message.get("publication_date")
    publication_date = None
    if raw_date:
        try:
            publication_date = datetime.fromisoformat(raw_date)
        except (TypeError, ValueError):
            logger.debug("Could not parse OpenAlex publication_date %r.", raw_date)

    return DocDetails(  # type: ignore[call-arg]
        key=None,
        bibtex_type=BIBTEX_MAPPING.get(message.get("type", "other"), "misc"),
        bibtex=None,
        authors=sanitized_authors,
        publication_date=publication_date,
        year=message.get("publication_year"),
        volume=biblio.get("volume"),
        issue=biblio.get("issue"),
        publisher=source.get("host_organization_name"),
        issn=source.get("issn_l"),
        # NOTE(review): only biblio.last_page is stored as pages — confirm
        # whether a first-last page range is intended here.
        pages=biblio.get("last_page"),
        journal=source.get("display_name"),
        url=message.get("doi"),
        title=message.get("title"),
        citation_count=message.get("cited_by_count"),
        doi=message.get("doi"),
        other=message,
    )
nadolskit marked this conversation as resolved.
Show resolved Hide resolved


class OpenAlexProvider(DOIOrTitleBasedProvider):
    """Metadata provider that looks documents up through OpenAlex."""

    async def get_doc_details(
        self, doi: str, session: aiohttp.ClientSession
    ) -> DocDetails | None:
        """Fetch document details for a DOI.

        Args:
            doi: The DOI of the document.
            session: The active session of the request.

        Returns:
            The document details if found, otherwise None.
        """
        details = await get_doc_details_from_openalex(doi=doi, session=session)
        return details

    async def search_by_title(
        self,
        query: str,
        session: aiohttp.ClientSession,
        title_similarity_threshold: float = 0.75,
    ) -> DocDetails | None:
        """Fetch document details for a title.

        Args:
            query: The title query for the document.
            session: The active session of the request.
            title_similarity_threshold: Threshold for title similarity.

        Returns:
            The document details if found, otherwise None.
        """
        details = await get_doc_details_from_openalex(
            title=query,
            session=session,
            title_similarity_threshold=title_similarity_threshold,
        )
        return details

    async def _query(self, query: TitleAuthorQuery | DOIQuery) -> DocDetails | None:
        """Dispatch a query to the DOI lookup or the title search.

        Args:
            query: The query containing either a DOI or title.
                DOI is prioritized over title.

        Returns:
            The document details if found, otherwise None.
        """
        # Anything that is not a DOIQuery is handled as a title search.
        if not isinstance(query, DOIQuery):
            return await self.search_by_title(
                query=query.title,
                session=query.session,
                title_similarity_threshold=query.title_similarity_threshold,
            )
        return await self.get_doc_details(doi=query.doi, session=query.session)
29 changes: 29 additions & 0 deletions paperqa/clients/shared_dicts.py
nadolskit marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Lookup from a content-type string to the BibTeX entry type used for it.
# 'misc' marks content types that have no direct BibTeX equivalent.
BIBTEX_MAPPING: dict[str, str] = {
    # Journals: only individual articles have a dedicated entry type.
    "journal-article": "article",
    "journal-issue": "misc",
    "journal-volume": "misc",
    "journal": "misc",
    # Conference proceedings.
    "proceedings-article": "inproceedings",
    "proceedings": "proceedings",
    # Data and components have no direct equivalent.
    "dataset": "misc",
    "component": "misc",
    # Reports: a series is several tech reports, each still a 'techreport'.
    "report": "techreport",
    "report-series": "techreport",
    # Standards have no direct equivalent.
    "standard": "misc",
    "standard-series": "misc",
    # Whole books, including edited/reference books, series and sets.
    "edited-book": "book",
    "monograph": "book",
    "reference-book": "book",
    "book": "book",
    "book-series": "book",
    "book-set": "book",
    # Pieces of a book (chapters, sections, parts, tracks, reference entries).
    "book-chapter": "inbook",
    "book-section": "inbook",
    "book-part": "inbook",
    "book-track": "inbook",
    "reference-entry": "inbook",
    # Dissertations are usually PhD theses.
    "dissertation": "phdthesis",
    # No direct equivalent for these two.
    "posted-content": "misc",
    "peer-review": "misc",
    # Assume an article if we don't know the type.
    "other": "article",
}
4 changes: 3 additions & 1 deletion paperqa/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,9 @@ class DocDetails(Doc):

citation: str
key: str | None = None
bibtex: str | None = None
bibtex: str | None = (
None # this will be autogenerated from other represented fields
)
nadolskit marked this conversation as resolved.
Show resolved Hide resolved
authors: list[str] | None = None
publication_date: datetime | None = None
year: int | None = None
Expand Down
22 changes: 21 additions & 1 deletion paperqa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import os
import re
import string
import unicodedata
from collections.abc import Collection, Coroutine, Iterable, Iterator
from datetime import datetime
from functools import reduce
Expand Down Expand Up @@ -345,6 +346,24 @@ def remove_substrings(target: str, substr_removal_list: Collection[str]) -> str:
return target


def convert_acutes(text: str) -> str:
    """Rewrite acute-accented vowels as an apostrophe followed by the plain vowel.

    Used for any client that returns names with acutes. Only the vowels
    a/e/i/o/u (either case) carrying a combining acute accent are rewritten;
    all other characters come back in NFC form.
    """
    # NFD splits an accented vowel into the base letter plus U+0301, which lets
    # the pattern below see the accent as a separate character.
    decomposed = unicodedata.normalize("NFD", text)
    # The backreference puts the apostrophe in front of the captured vowel.
    rewritten = re.sub(r"([aeiouAEIOU])\u0301", r"'\1", decomposed)
    return unicodedata.normalize("NFC", rewritten)


def remove_acutes(text: str) -> str:
    """Return text in NFD form with every nonspacing mark (category Mn) dropped."""
    decomposed = unicodedata.normalize("NFD", text)
    kept = [char for char in decomposed if unicodedata.category(char) != "Mn"]
    return "".join(kept)


def bibtex_field_extract(
bibtex: str, field: str, missing_replacements: dict[str, str] | None = None
) -> str:
Expand Down Expand Up @@ -372,7 +391,8 @@ def create_bibtex_key(author: list[str], year: str, title: str) -> str:
FORBIDDEN_KEY_CHARACTERS = {"_", " ", "-", "/", "'", "`", ":", ",", "\n"}
try:
author_rep = (
author[0].split()[-1].casefold()
# casefold will not remove acutes
remove_acutes(author[0].split()[-1].casefold())
if "Unknown" not in author[0]
else UNKNOWN_AUTHOR_KEY
)
Expand Down
Loading
Loading