diff --git a/.gitignore b/.gitignore index 949696c..5856d23 100644 --- a/.gitignore +++ b/.gitignore @@ -144,4 +144,5 @@ dmypy.json # Pyre type checker .pyre/ -*.tar.gz \ No newline at end of file +*.tar.gz +*.zip \ No newline at end of file diff --git a/environment.yml b/environment.yml index 65dccc5..d8a3757 100644 --- a/environment.yml +++ b/environment.yml @@ -40,6 +40,8 @@ dependencies: - libapr=1.7.0=hf178f73_5 - libapriconv=1.2.2=h7f8727e_5 - libaprutil=1.6.1=hfefca11_5 + - libblas=3.9.0=12_linux64_mkl + - libcblas=3.9.0=12_linux64_mkl - libdb=6.2.32=hf484d3e_0 - libedit=3.1.20210910=h7f8727e_0 - libffi=3.3=he6710b0_2 @@ -47,6 +49,7 @@ dependencies: - libgomp=9.3.0=h5101ec6_17 - libiconv=1.15=h63c8f33_5 - libidn2=2.3.2=h7f8727e_0 + - liblapack=3.9.0=12_linux64_mkl - libpng=1.6.37=hbc83047_0 - libstdcxx-ng=9.3.0=hd4cf53a_17 - libtasn1=4.16.0=h27cfd23_0 @@ -119,20 +122,25 @@ dependencies: - absl-py==1.0.0 - astunparse==1.6.3 - beir==0.2.3 + - black==23.1.0 - blis==0.7.5 - cachetools==5.0.0 - catalogue==2.0.6 - charset-normalizer==2.0.10 - click==8.0.3 + - contourpy==1.0.7 - crash-ipdb==0.0.3 + - cycler==0.11.0 - cymem==2.0.6 - cython==0.29.26 - datasets==1.1.3 - dill==0.3.4 - elasticsearch==7.16.3 + - exceptiongroup==1.1.1 - faiss-cpu==1.7.2 - filelock==3.4.2 - flatbuffers==2.0 + - fonttools==4.38.0 - gast==0.4.0 - google-auth==2.5.0 - google-auth-oauthlib==0.4.6 @@ -142,27 +150,36 @@ dependencies: - huggingface-hub==0.4.0 - idna==3.3 - importlib-metadata==4.10.1 + - importlib-resources==5.10.2 + - iniconfig==2.0.0 - ipdb==0.13.9 - jinja2==3.0.3 - joblib==1.1.0 - keras==2.7.0 - keras-preprocessing==1.1.2 + - kiwisolver==1.4.4 - langcodes==3.3.0 - libclang==12.0.0 - lightgbm==3.3.2 - markdown==3.3.6 - markupsafe==2.0.1 + - matplotlib==3.7.0 - multiprocess==0.70.12.2 - murmurhash==1.0.6 + - mypy-extensions==1.0.0 - nltk==3.6.7 - nmslib==2.1.1 - numpy==1.22.1 - oauthlib==3.1.1 - onnxruntime==1.10.0 - opt-einsum==3.3.0 - - packaging==21.3 + - packaging==23.0 - 
pandas==1.4.0 + - pathspec==0.11.0 - pathy==0.6.1 + - patsy==0.5.3 + - platformdirs==3.0.0 + - pluggy==1.0.0 - preshed==3.0.6 - protobuf==3.19.3 - psutil==5.9.0 @@ -173,7 +190,8 @@ dependencies: - pydantic==1.8.2 - pyjnius==1.4.1 - pyparsing==3.0.7 - - pyserini==0.15.0 + - pyserini==0.20.0 + - pytest==7.3.0 - python-dateutil==2.8.2 - pytrec-eval==0.5 - pytz==2021.3 @@ -192,6 +210,7 @@ dependencies: - spacy-legacy==3.0.8 - spacy-loggers==1.0.1 - srsly==2.4.2 + - statsmodels==0.13.5 - tensorboard==2.8.0 - tensorboard-data-server==0.6.1 - tensorboard-plugin-wit==1.8.1 @@ -205,8 +224,9 @@ dependencies: - threadpoolctl==3.0.0 - tokenizers==0.10.3 - toml==0.10.2 + - tomli==2.0.1 - torch-scatter==2.0.6 - - tqdm==4.49.0 + - tqdm==4.64.1 - transformers==4.15.0 - typer==0.4.0 - urllib3==1.26.8 @@ -215,4 +235,4 @@ dependencies: - wrapt==1.13.3 - xxhash==2.0.2 - zipp==3.7.0 -prefix: /home/n3thakur/anaconda3/envs/sparse-retrieval +prefix: /home/fb20user07/miniconda3/envs/sparse-retrieval diff --git a/sample-data/build.py b/sample-data/build.py new file mode 100644 index 0000000..dc22a4e --- /dev/null +++ b/sample-data/build.py @@ -0,0 +1,58 @@ +import json +import shutil +from beir import util, LoggingHandler +from beir.retrieval import models +from beir.datasets.data_loader import GenericDataLoader +from beir.retrieval.evaluation import EvaluateRetrieval +from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES + +import logging +import pathlib, os +import random + +#### Just some code to print debug information to stdout +logging.basicConfig( + format="%(asctime)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + level=logging.INFO, + handlers=[LoggingHandler()], +) +#### /print debug information to stdout + +#### Download scifact.zip dataset and unzip the dataset +dataset = "scifact" +url = ( + "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format( + dataset + ) +) +out_dir = os.path.join(pathlib.Path(__file__).parent.absolute(), 
".") +shutil.rmtree(os.path.join(out_dir, "scifact")) +data_path = util.download_and_unzip(url, out_dir) + +corpus, queries, qrels = GenericDataLoader(data_folder=data_path).load(split="test") +random_state = random.Random(42) +corpus_sampled = dict(random_state.sample(list(corpus.items()), k=10)) +qrels_sampled = dict(random_state.sample(list(qrels.items()), k=3)) +for qid, rels in qrels_sampled.items(): + for did in rels: + corpus_sampled[did] = corpus[did] +queries_sampled = {qid: queries[qid] for qid, _ in qrels_sampled.items()} + + +with open(os.path.join(data_path, "corpus.jsonl"), "w") as f: + for id, line in corpus_sampled.items(): + line.update({"_id": id}) + f.write(json.dumps(line) + "\n") + +with open(os.path.join(data_path, "queries.jsonl"), "w") as f: + for qid, text in queries_sampled.items(): + f.write(json.dumps({"_id": qid, "text": text, "metadata": {}}) + "\n") + +with open(os.path.join(data_path, "qrels", "test.tsv"), "w") as f: + f.write("query-id\tcorpus-id\tscore\n") + for qid, rels in qrels_sampled.items(): + for did, rel in rels.items(): + f.write(f"{qid}\t{did}\t{rel}\n") + +os.remove(os.path.join(data_path, "qrels", "train.tsv")) diff --git a/sample-data/scifact/corpus.jsonl b/sample-data/scifact/corpus.jsonl new file mode 100644 index 0000000..6bb8443 --- /dev/null +++ b/sample-data/scifact/corpus.jsonl @@ -0,0 +1,13 @@ +{"text": "OBJECTIVES To examine the effect of routinely administered psychiatric questionnaires on the recognition, management, and outcome of psychiatric disorders in non-psychiatric settings. DATA SOURCES Embase, Medline, PsycLIT, Cinahl, Cochrane Controlled Trials Register, and hand searches of key journals. METHODS A systematic review of randomised controlled trials of the administration and routine feedback of psychiatric screening and outcome questionnaires to clinicians in non-psychiatric settings. 
Narrative overview of key design features and end points, together with a random effects quantitative synthesis of comparable studies. MAIN OUTCOME MEASURES Recognition of psychiatric disorders after feedback of questionnaire results; interventions for psychiatric disorders; and outcome of psychiatric disorders. RESULTS Nine randomised studies were identified that examined the use of common psychiatric instruments in primary care and general hospital settings. Studies compared the effect of the administration of these instruments followed by the feedback of the results to clinicians, with administration with no feedback. Meta-analytic pooling was possible for four of these studies (2457 participants), which measured the effect of feedback on the recognition of depressive disorders. Routine administration and feedback of scores for all patients (irrespective of score) did not increase the overall rate of recognition of mental disorders such as anxiety and depression (relative risk of detection of depression by clinician after feedback 0.95, 95% confidence interval 0.83 to 1.09). Two studies showed that routine administration followed by selective feedback for only high scorers increased the rate of recognition of depression (relative risk of detection of depression after feedback 2.64, 1.62 to 4.31). This increased recognition, however, did not translate into an increased rate of intervention. Overall, studies of routine administration of psychiatric measures did not show an effect on patient outcome. CONCLUSIONS The routine measurement of outcome is a costly exercise. Little evidence shows that it is of benefit in improving psychosocial outcomes of those with psychiatric disorder managed in non-psychiatric settings.", "title": "Anxiety", "_id": "4695046"} +{"text": "The importance of genetic factors in etiology of chronic lymphocytic leukemia (CLL) is suggested by family and population studies. 
However, the spectrum of malignancies sharing common genetic factors with CLL and the effects of sex and age on familial risk are unknown. We used the Swedish Family-Cancer Database to test for increased familial risks of CLL and other lymphoproliferative tumors. Cancer diagnoses from 1958 to 1998 were assessed in 14 336 first-degree relatives of 5918 CLL cases and in 28 876 first-degree relatives of 11 778 controls. Cancer risks in relatives of cases were compared with those in relatives of controls using marginal survival models. Relatives of cases were at significantly increased risk for CLL (relative risk [RR] = 7.52; 95% confidence interval [CI], 3.63-15.56), for non-Hodgkin lymphoma (RR = 1.45; 95% CI, 0.98-2.16), and for Hodgkin lymphoma (RR = 2.35; 95% CI, 1.08-5.08). CLL risks were similar in parents, siblings, and offspring of cases, in male and female relatives, and were not affected by the case's age at diagnosis. Anticipation was not significant when analyzed using life table methods. We conclude that the familial component of CLL is shared with other lymphoproliferative malignances, suggesting common genetic pathways. However, because clinically diagnosed CLL is uncommon, absolute excess risk to relatives is small.", "title": "Familial risk of lymphoproliferative tumors in families of patients with chronic lymphocytic leukemia: results from the Swedish Family-Cancer Database.", "_id": "1153655"} +{"text": "Lipid droplets are ubiquitous triglyceride and sterol ester storage organelles required for energy storage homeostasis and biosynthesis. Although little is known about lipid droplet formation and regulation, it is clear that members of the PAT (perilipin, adipocyte differentiation related protein, tail interacting protein of 47 kDa) protein family coat the droplet surface and mediate interactions with lipases that remobilize the stored lipids. 
We identified key Drosophila candidate genes for lipid droplet regulation by RNA interference (RNAi) screening with an image segmentation-based optical read-out system, and show that these regulatory functions are conserved in the mouse. Those include the vesicle-mediated Coat Protein Complex I (COPI) transport complex, which is required for limiting lipid storage. We found that COPI components regulate the PAT protein composition at the lipid droplet surface, and promote the association of adipocyte triglyceride lipase (ATGL) with the lipid droplet surface to mediate lipolysis. Two compounds known to inhibit COPI function, Exo1 and Brefeldin A, phenocopy COPI knockdowns. Furthermore, RNAi inhibition of ATGL and simultaneous drug treatment indicate that COPI and ATGL function in the same pathway. These data indicate that the COPI complex is an evolutionarily conserved regulator of lipid homeostasis, and highlight an interaction between vesicle transport systems and lipid droplets.", "title": "COPI Complex Is a Regulator of Lipid Homeostasis", "_id": "13794374"} +{"text": "This study investigated whether there are race differences in the structure of informal caregiving networks. Data on 3,793 functionally impaired persons age 65 and over from the 1989 National Long-Term Care Survey were analyzed. The size of the total caregiver network and the unpaid network did not differ by race, but the likelihood of there being a non-immediate family member among unpaid caregivers was higher among disabled older blacks. 
These findings raise questions about whether race differences in nursing home utilization and paid long-term care services, documented in other studies, can be explained by differences in caregiving arrangements.", "title": "The structure of informal care: are there differences by race?", "_id": "12087063"} +{"text": "BACKGROUND The purpose of this study was to test the hypothesis that vasodilator responses of porcine coronary resistance arteries are increased by exercise training. METHODS AND RESULTS Yucatan miniature swine were randomly divided into groups of exercise-trained (ET) and sedentary (SED) control pigs. ET pigs were placed on a progressive treadmill training program lasting 16 to 20 weeks, and SED pigs remained inactive during the same time period. Coronary resistance arteries 64 to 157 microns in diameter were isolated for in vitro evaluation of relaxation responses to the endothelium-independent dilators sodium nitroprusside (1 x 10(-10) to 1 x 10(-4) mol/L) and adenosine (1 x 10(-10) to 1 x 10(-5) mol/L) and to bradykinin (1 x 10(-13) to 3 x 10(-7) mol/L), an endothelium-dependent agent. Relaxation responses to adenosine and sodium nitroprusside were not altered by exercise training. Endothelium-dependent relaxation to bradykinin was enhanced in coronary resistance arteries from ET pigs (IC50: ET, 0.07 +/- 0.02 nmol/L; SED, 1.59 +/- 0.09 nmol/L). To determine whether prostanoids and/or the nitric oxide synthase pathway were involved in the ET-induced changes in bradykinin-induced vasodilation, responses to bradykinin were examined in coronary resistance arteries from both ET and SED pigs in the presence of indomethacin and in the presence of nitro-monomethyl L-arginine (L-NMMA). Both indomethacin and L-NMMA produced significant inhibition of the bradykinin-induced relaxation in vessels from both groups. Despite decreased bradykinin-induced relaxation after indomethacin, bradykinin-induced vasodilation was still enhanced in vessels from the ET group. 
L-NMMA caused greater inhibition of the bradykinin-induced relaxation in coronary resistance arteries from ET pigs relative to arteries from SED pigs and eliminated the training-induced enhancement of the bradykinin responses. CONCLUSIONS These results suggest that exercise training enhances bradykinin-induced vasodilation through increased endothelium-derived relaxing factor/nitric oxide production by the L-arginine/nitric oxide synthase pathway.", "title": "Vasodilator responses of coronary resistance arteries of exercise-trained pigs.", "_id": "10697096"} +{"text": "BACKGROUND Influenza A virus (IAV) infection primarily targets respiratory epithelial cells and produces clinical outcomes ranging from mild upper respiratory infection to severe pneumonia. Recent studies have shown the importance of lung antioxidant defense systems against injury by IAV. Nuclear factor-erythroid 2 related factor 2 (Nrf2) activates the majority of antioxidant genes. METHODS Alveolar type II (ATII) cells and alveolar macrophages (AM) were isolated from human lungs not suitable for transplantation and donated for medical research. In some studies ATII cells were transdifferentiated to alveolar type I-like (ATI-like) cells. Alveolar epithelial cells were infected with A/PR/8/34 (PR8) virus. We analyzed PR8 virus production, influenza A nucleoprotein levels, ROS generation and expression of antiviral genes. Immunocytofluorescence was used to determine Nrf2 translocation and western blotting to detect Nrf2, HO-1 and caspase 1 and 3 cleavage. We also analyzed ingestion of PR8 virus infected apoptotic ATII cells by AM, cytokine levels by ELISA, glutathione levels, necrosis and apoptosis by TUNEL assay. Moreover, we determined the critical importance of Nrf2 using adenovirus Nrf2 (AdNrf2) or Nrf2 siRNA to overexpress or knockdown Nrf2, respectively. RESULTS We found that IAV induced oxidative stress, cytotoxicity and apoptosis in ATI-like and ATII cells. 
We also found that AM can ingest PR8 virus-induced apoptotic ATII cells (efferocytosis) but not viable cells, whereas ATII cells did not ingest these apoptotic cells. PR8 virus increased ROS production, Nrf2, HO-1, Mx1 and OAS1 expression and Nrf2 translocation to the nucleus. Nrf2 knockdown with siRNA sensitized ATI-like cells and ATII cells to injury induced by IAV and overexpression of Nrf2 with AdNrf2 protected these cells. Furthermore, Nrf2 overexpression followed by infection with PR8 virus decreased virus replication, influenza A nucleoprotein expression, antiviral response and oxidative stress. However, AdNrf2 did not increase IFN-\u03bb1 (IL-29) levels. CONCLUSIONS Our results indicate that IAV induces alveolar epithelial injury and that Nrf2 protects these cells from the cytopathic effects of IAV likely by increasing the expression of antioxidant genes. Identifying the pathways involved in protecting cells from injury during influenza infection may be particularly important for developing new therapeutic strategies.", "title": "Nrf2 protects human alveolar epithelial cells against injury induced by influenza A virus", "_id": "6182947"} +{"text": "Since it was discovered that the anti-hypertensive agent ifenprodil has neuroprotective activity through its effects on NMDA (N-methyl-D-aspartate) receptors, a determined effort has been made to understand the mechanism of action and to develop improved therapeutic compounds on the basis of this knowledge. Neurotransmission mediated by NMDA receptors is essential for basic brain development and function. These receptors form heteromeric ion channels and become activated after concurrent binding of glycine and glutamate to the GluN1 and GluN2 subunits, respectively. A functional hallmark of NMDA receptors is that their ion-channel activity is allosterically regulated by binding of small compounds to the amino-terminal domain (ATD) in a subtype-specific manner. 
Ifenprodil and related phenylethanolamine compounds, which specifically inhibit GluN1 and GluN2B NMDA receptors, have been intensely studied for their potential use in the treatment of various neurological disorders and diseases, including depression, Alzheimer's disease and Parkinson's disease. Despite considerable enthusiasm, mechanisms underlying the recognition of phenylethanolamines and ATD-mediated allosteric inhibition remain limited owing to a lack of structural information. Here we report that the GluN1 and GluN2B ATDs form a heterodimer and that phenylethanolamine binds at the interface between GluN1 and GluN2B, rather than within the GluN2B cleft. The crystal structure of the heterodimer formed between the GluN1b ATD from Xenopus laevis and the GluN2B ATD from Rattus norvegicus shows a highly distinct pattern of subunit arrangement that is different from the arrangements observed in homodimeric non-NMDA receptors and reveals the molecular determinants for phenylethanolamine binding. Restriction of domain movement in the bi-lobed structure of the GluN2B ATD, by engineering of an inter-subunit disulphide bond, markedly decreases sensitivity to ifenprodil, indicating that conformational freedom in the GluN2B ATD is essential for ifenprodil-mediated allosteric inhibition of NMDA receptors. These findings pave the way for improving the design of subtype-specific compounds with therapeutic value for neurological disorders and diseases.", "title": "Subunit Arrangement and Phenylethanolamine Binding in GluN1/GluN2B NMDA Receptors", "_id": "4425507"} +{"text": "PURPOSE To determine the toxicity and the therapeutic efficacy of the combination of the recombinant tumor necrosis factor alpha (rTNF alpha), recombinant interferon gamma (rIFN-gamma), and melphalan, we designed a protocol using isolation limb perfusion (ILP) with hyperthermia for in-transit metastases of melanoma and recurrent sarcoma. 
The triple combination was chosen because of the reported synergistic antitumor effect of rTNF alpha with IFN-gamma and of rTNF alpha with alkylating agents. PATIENTS AND METHODS Twenty-three patients received a total of 25 ILPs with the triple combination. There were 19 females and four males with either multiple progressive in-transit melanoma metastases of the extremities (stage IIIa or IIIab; 19 patients) or recurrent soft tissue sarcoma (five). The rTNF alpha was injected as a bolus in the arterial line, and total dose ranged between 2 and 4 mg, under hyperthermic conditions (40 degrees C to 40.5 degrees C) for 90 minutes. The rIFN-gamma was given subcutaneously (SC) on days -2 and -1 and in the perfusate, with rTNF alpha at the dose of 0.2 mg. Melphalan (Alkeran; Burroughs Wellcome Co, London, England) was administered in the perfusate at 40 micrograms/mL. RESULTS Toxicity observed during three ILPs in a pilot study with rTNF alpha included only two severe toxicities: one severe hypotension with tachycardia and transient oliguria and one moderate hypotension for 4 hours followed by severe kidney failure with complete recovery on day 29. In all 18 ILPs performed in the triple combination protocol, the patients received continuous infusion dopamine at 3 micrograms/kg/min from the start of ILP and for 72 hours and showed only mild hypotension and transient chills and temperature. Regional toxicity attributable to rTNF alpha was minimal. There have been 11 cases with hematologic toxicity consisting of neutropenia (one grade 4 and one grade 3) and neutropenia with thrombocytopenia (one grade 4 and three grade 2). Twelve patients had been previously treated with melphalan in ILP (11) or with cisplatin (one). The 23 patients are assessable: there have been 21 complete responses (CRs; range, 4 to 29 months; 89%), two partial responses (PRs; range, 2 to 3 months), and no failures. 
Overall disease-free survival and survival have been 70% and 76%, respectively, at 12 months. In all cases, softening of the nodules was obvious within 3 days after ILP and time to definite response ranged between day 5 and 30. CONCLUSION This preliminary analysis of a phase II study suggests that high-dose rTNF alpha can be administered with acceptable toxicity by ILP with dopamine and hyperhydration. Tumor responses can be evidenced in melanoma and sarcoma. Furthermore, combination of rTNF alpha, rIFN-gamma, and melphalan seems to achieve high efficacy with minimal toxicity, even after failure of prior therapy with melphalan alone.", "title": "High-dose recombinant tumor necrosis factor alpha in combination with interferon gamma and melphalan in isolation perfusion of the limbs for melanoma and sarcoma.", "_id": "35766603"} +{"text": "Aire-expressing medullary thymic epithelial cells (mTECs) play a key role in preventing autoimmunity by expressing tissue-restricted antigens to help purge the emerging T cell receptor repertoire of self-reactive specificities. Here we demonstrate a novel role for a CD4+3\u2212 inducer cell population, previously linked to development of organized secondary lymphoid structures and maintenance of T cell memory in the functional regulation of Aire-mediated promiscuous gene expression in the thymus. CD4+3\u2212 cells are closely associated with mTECs in adult thymus, and in fetal thymus their appearance is temporally linked with the appearance of Aire+ mTECs. We show that RANKL signals from this cell promote the maturation of RANK-expressing CD80\u2212Aire\u2212 mTEC progenitors into CD80+Aire+ mTECs, and that transplantation of RANK-deficient thymic stroma into immunodeficient hosts induces autoimmunity. 
Collectively, our data reveal cellular and molecular mechanisms leading to the generation of Aire+ mTECs and highlight a previously unrecognized role for CD4+3\u2212RANKL+ inducer cells in intrathymic self-tolerance.", "title": "RANK signals from CD4+3\u2212 inducer cells regulate development of Aire-expressing epithelial cells in the thymic medulla", "_id": "3952288"} +{"text": "Various proteins have been found to play roles in both the repair of UV damaged DNA and heterochromatin-mediated silencing in the yeast Saccharomyces cerevisiae. In particular, factors that are involved in the methylation of lysine-79 of histone H3 by Dot1p have been implicated in both processes, suggesting a bipartite function for this modification. We find that a dot1 null mutation and a histone H3 point mutation at lysine-79 cause increased sensitivity to UV radiation, suggesting that lysine-79 methylation is important for efficient repair of UV damage. Epistasis analysis between dot1 and various UV repair genes indicates that lysine-79 methylation plays overlapping roles within the nucleotide excision, post-replication and recombination repair pathways, as well as RAD9-mediated checkpoint function. In contrast, epistasis analysis with the H3 lysine-79 point mutation indicates that the lysine-to-glutamic acid substitution exerts specific effects within the nucleotide excision repair and post-replication repair pathways, suggesting that this allele only disrupts a subset of the functions of lysine-79 methylation. 
The overall results indicate the existence of distinct and separable roles of histone H3 lysine-79 methylation in the response to UV damage, potentially serving to coordinate the various repair processes.", "title": "Methylation of histone H3 lysine-79 by Dot1p plays multiple roles in the response to UV damage in Saccharomyces cerevisiae.", "_id": "42267740"} +{"text": "Two-component signal transduction pathways comprising histidine protein kinases (HPKs) and their response regulators (RRs) are widely used to control bacterial responses to environmental challenges. Some bacteria have over 150 different two-component pathways, and the specificity of the phosphotransfer reactions within these systems is tightly controlled to prevent unwanted crosstalk. One of the best understood two-component signalling pathways is the chemotaxis pathway. Here, we present the 1.40 A crystal structure of the histidine-containing phosphotransfer domain of the chemotaxis HPK, CheA(3), in complex with its cognate RR, CheY(6). A methionine finger on CheY(6) that nestles in a hydrophobic pocket in CheA(3) was shown to be important for the interaction and was found to only occur in the cognate RRs of CheA(3), CheY(6), and CheB(2). Site-directed mutagenesis of this methionine in combination with two adjacent residues abolished binding, as shown by surface plasmon resonance studies, and phosphotransfer from CheA(3)-P to CheY(6). Introduction of this methionine and an adjacent alanine residue into a range of noncognate CheYs, dramatically changed their specificity, allowing protein interaction and rapid phosphotransfer from CheA(3)-P. The structure presented here has allowed us to identify specificity determinants for the CheA-CheY interaction and subsequently to successfully reengineer phosphotransfer signalling. 
In summary, our results provide valuable insight into how cells mediate specificity in one of the most abundant signalling pathways in biology, two-component signal transduction.", "title": "Using Structural Information to Change the Phosphotransfer Specificity of a Two-Component Chemotaxis Signalling Complex", "_id": "11603066"} +{"text": "Half the world's population is chronically infected with Helicobacter pylori, causing gastritis, gastric ulcers and an increased incidence of gastric adenocarcinoma. Its proton-gated inner-membrane urea channel, HpUreI, is essential for survival in the acidic environment of the stomach. The channel is closed at neutral pH and opens at acidic pH to allow the rapid access of urea to cytoplasmic urease. Urease produces NH(3) and CO(2), neutralizing entering protons and thus buffering the periplasm to a pH of roughly 6.1 even in gastric juice at a pH below 2.0. Here we report the structure of HpUreI, revealing six protomers assembled in a hexameric ring surrounding a central bilayer plug of ordered lipids. Each protomer encloses a channel formed by a twisted bundle of six transmembrane helices. The bundle defines a previously unobserved fold comprising a two-helix hairpin motif repeated three times around the central axis of the channel, without the inverted repeat of mammalian-type urea transporters. Both the channel and the protomer interface contain residues conserved in the AmiS/UreI superfamily, suggesting the preservation of channel architecture and oligomeric state in this superfamily. Predominantly aromatic or aliphatic side chains line the entire channel and define two consecutive constriction sites in the middle of the channel. Mutation of Trp 153 in the cytoplasmic constriction site to Ala or Phe decreases the selectivity for urea in comparison with thiourea, suggesting that solute interaction with Trp 153 contributes specificity. 
The previously unobserved hexameric channel structure described here provides a new model for the permeation of urea and other small amide solutes in prokaryotes and archaea.", "title": "Structure of the proton-gated urea channel from the gastric pathogen Helicobacter pylori", "_id": "4387784"} +{"text": "Embryos have the ability to self-regulate and regenerate normal structures after being sectioned in half. How is such a morphogenetic field established? We discovered that quadruple knockdown of ADMP and BMP2/4/7 in Xenopus embryos eliminates self-regulation, causing ubiquitous neural induction throughout the ectoderm. ADMP transcription in the Spemann organizer is activated at low BMP levels. When ventral BMP2/4/7 signals are depleted, Admp expression increases, allowing for self-regulation. ADMP has BMP-like activity and signals via the ALK-2 receptor. It is unable to signal dorsally because of inhibition by Chordin. The ventral BMP antagonists Sizzled and Bambi further refine the pattern. By transplanting dorsal or ventral wild-type grafts into ADMP/BMP2/4/7-depleted hosts, we demonstrate that both poles serve as signaling centers that can induce histotypic differentiation over considerable distances. 
We conclude that dorsal and ventral BMP signals and their extracellular antagonists expressed under opposing transcriptional regulation provide a molecular mechanism for embryonic self-regulation.", "title": "Regulation of ADMP and BMP2/4/7 at Opposite Embryonic Poles Generates a Self-Regulating Morphogenetic Field", "_id": "6076903"} diff --git a/sample-data/scifact/qrels/test.tsv b/sample-data/scifact/qrels/test.tsv new file mode 100644 index 0000000..df7ab1f --- /dev/null +++ b/sample-data/scifact/qrels/test.tsv @@ -0,0 +1,4 @@ +query-id corpus-id score +1019 11603066 1 +75 4387784 1 +72 6076903 1 diff --git a/sample-data/scifact/queries-test.tsv b/sample-data/scifact/queries-test.tsv new file mode 100644 index 0000000..52e3873 --- /dev/null +++ b/sample-data/scifact/queries-test.tsv @@ -0,0 +1,3 @@ +1019 Rapid phosphotransfer rates govern fidelity in two component systems +75 Active H. pylori urease has a polymeric structure that compromises two subunits, UreA and UreB. +72 Activator-inhibitor pairs are provided dorsally by Admpchordin. diff --git a/sample-data/scifact/queries.jsonl b/sample-data/scifact/queries.jsonl new file mode 100644 index 0000000..845a0bc --- /dev/null +++ b/sample-data/scifact/queries.jsonl @@ -0,0 +1,3 @@ +{"_id": "1019", "text": "Rapid phosphotransfer rates govern fidelity in two component systems", "metadata": {}} +{"_id": "75", "text": "Active H. 
pylori urease has a polymeric structure that compromises two subunits, UreA and UreB.", "metadata": {}} +{"_id": "72", "text": "Activator-inhibitor pairs are provided dorsally by Admpchordin.", "metadata": {}} diff --git a/sprint/__init__.py b/sprint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sprint/inference/methods/sparta.py b/sprint/inference/methods/sparta.py index b46315e..deae0e9 100644 --- a/sprint/inference/methods/sparta.py +++ b/sprint/inference/methods/sparta.py @@ -8,7 +8,9 @@ class SPARTADocumentEncoder(torch.nn.Module, DocumentEncoder): - def __init__(self, model_name, device): #SpanBERT/spanbert-base-cased'): #bert-base-uncased #distilbert-base-uncased #distilroberta-base + def __init__( + self, model_name, device + ): # SpanBERT/spanbert-base-cased'): #bert-base-uncased #distilbert-base-uncased #distilroberta-base super().__init__() print("Model name:", model_name) self.bert_model = AutoModel.from_pretrained(model_name) @@ -18,7 +20,9 @@ def __init__(self, model_name, device): #SpanBERT/spanbert-base-cased'): #bert-b self.device = device self.max_length = 300 ##### - self.bert_input_emb = self.bert_model.embeddings.word_embeddings(torch.tensor(list(range(0, len(self.tokenizer))), device=device)) # for building term weights + self.bert_input_emb = self.bert_model.embeddings.word_embeddings( + torch.tensor(list(range(0, len(self.tokenizer))), device=device) + ) # for building term weights self.reverse_voc = {v: k for k, v in self.tokenizer.vocab.items()} self.special_token_embedding_to_zero = False # used during inference @@ -26,39 +30,60 @@ def bert_embeddings(self, input_ids): return self.bert_model.embeddings.word_embeddings(input_ids) def query_embeddings(self, query): - queries_batch = self.tokenizer(query, padding=True, truncation=True, return_tensors='pt', add_special_tokens=False, max_length=self.max_length).to(self.device) - queries_embeddings = self.bert_embeddings(queries_batch['input_ids']) + queries_batch = 
self.tokenizer( + query, + padding=True, + truncation=True, + return_tensors="pt", + add_special_tokens=False, + max_length=self.max_length, + ).to(self.device) + queries_embeddings = self.bert_embeddings(queries_batch["input_ids"]) return queries_embeddings def passage_embeddings(self, passages): - passage_batch = self.tokenizer(passages, padding=True, truncation=True, return_tensors='pt', max_length=self.max_length).to(self.device) + passage_batch = self.tokenizer( + passages, + padding=True, + truncation=True, + return_tensors="pt", + max_length=self.max_length, + ).to(self.device) passage_embeddings = self.bert_model(**passage_batch).last_hidden_state return passage_embeddings def compute_scores(self, query_embeddings, passage_embeddings): ### Eq. 4 - Term matching scores = [] - for idx in range(len(query_embeddings)): #TODO: use correct pytorch function for this - scores.append(torch.matmul(query_embeddings[idx], passage_embeddings.transpose(1, 2))) + for idx in range( + len(query_embeddings) + ): # TODO: use correct pytorch function for this + scores.append( + torch.matmul(query_embeddings[idx], passage_embeddings.transpose(1, 2)) + ) scores = torch.stack(scores) - #print("Scores:", scores.shape) + # print("Scores:", scores.shape) max_scores = torch.max(scores, dim=-1).values - #print("Max-Scores:", max_scores.shape) + # print("Max-Scores:", max_scores.shape) ### Eq. 5 - ReLu - relu_scores = torch.relu(max_scores) #torch.relu(max_scores + self.score_bias) #Bias score does not change that much? - #print("ReLu-Scores:", relu_scores.shape) + relu_scores = torch.relu( + max_scores + ) # torch.relu(max_scores + self.score_bias) #Bias score does not change that much? + # print("ReLu-Scores:", relu_scores.shape) ### Eq. 
6 - Final Score - final_scores = torch.sum(torch.log(relu_scores + 1), dim=-1) #.unsqueeze(dim=0) - #print("Final scores:", final_scores.shape) + final_scores = torch.sum( + torch.log(relu_scores + 1), dim=-1 + ) # .unsqueeze(dim=0) + # print("Final scores:", final_scores.shape) return final_scores def forward(self, queries, passages): query_embeddings = self.query_embeddings(queries) passage_embeddings = self.passage_embeddings(passages) return self.compute_scores(query_embeddings, passage_embeddings) - + ### def _set_special_token_embedding_to_zero(self): if self.bert_model.training == True: @@ -66,32 +91,38 @@ def _set_special_token_embedding_to_zero(self): if self.special_token_embedding_to_zero: return - + for special_id in self.tokenizer.all_special_ids: self.bert_input_emb[special_id] = 0 * self.bert_input_emb[special_id] - + self.special_token_embedding_to_zero = True - + ### def encode(self, texts, **kwargs): self._set_special_token_embedding_to_zero() # Important for full reproduction (although it seems to have little influence on the performance) - + term_weights_batch = [] - sparse_vec_size = kwargs.setdefault('sparse_vec_size', 2000) # TODO: Make this into the search.py cli arguments + sparse_vec_size = kwargs.setdefault( + "sparse_vec_size", 2000 + ) # TODO: Make this into the search.py cli arguments assert sparse_vec_size <= len(self.tokenizer) - tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=500).to(self.device) + tokens = self.tokenizer( + texts, padding=True, truncation=True, return_tensors="pt", max_length=500 + ).to(self.device) passage_embeddings = self.bert_model(**tokens).last_hidden_state - for passage_emb in passage_embeddings: # TODO: Optimize this by batch operations + for ( + passage_emb + ) in passage_embeddings: # TODO: Optimize this by batch operations scores = torch.matmul(self.bert_input_emb, passage_emb.transpose(0, 1)) max_scores = torch.max(scores, dim=-1).values - relu_scores = 
torch.relu(max_scores) #Eq. 5 + relu_scores = torch.relu(max_scores) # Eq. 5 final_scores = torch.log(relu_scores + 1) # Eq. 6, final score top_results = torch.topk(final_scores, k=sparse_vec_size) tids = top_results[1].cpu().detach().tolist() scores = top_results[0].cpu().detach().tolist() - + term_weights = {} for tid, score in zip(tids, scores): if score > 0: @@ -100,23 +131,23 @@ def encode(self, texts, **kwargs): break term_weights_batch.append(term_weights) - + return term_weights_batch -class SPARTAQueryEncoder(QueryEncoder): - def __init__(self, model_name_or_path, device='cpu'): +class SPARTAQueryEncoder(QueryEncoder): + def __init__(self, model_name_or_path, device="cpu"): self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) self.reverse_voc = {v: k for k, v in self.tokenizer.vocab.items()} def encode(self, text, **kwargs): - token_ids = self.tokenizer(text, add_special_tokens=False)['input_ids'] + token_ids = self.tokenizer(text, add_special_tokens=False)["input_ids"] tokens = [self.reverse_voc[token_id] for token_id in token_ids] term_weights = defaultdict(int) - + # Important for reproducing the results: # Note that in Pyserini/Anserini, the query term weights are maintained by JHashMap, # which will keep only one term weight for identical terms for token in tokens: term_weights[token] += 1 - return term_weights \ No newline at end of file + return term_weights diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..13f8aeb --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,107 @@ +import shutil +import pytest +import tempfile +import os + + +@pytest.fixture(name="bert_path", scope="session") +def bert_path_fixture() -> str: + try: + local_dir = tempfile.mkdtemp() + + import torch + + torch.manual_seed(42) + vocab = [ + "[PAD]", + "[UNK]", + "[CLS]", + "[SEP]", + "[MASK]", + "the", + "of", + "and", + "in", + "to", + 
"was", + "he", + ] + vocab_file = os.path.join(local_dir, "vocab.txt") + with open(vocab_file, "w") as f: + f.write("\n".join(vocab)) + + from transformers import BertConfig, BertModel, BertTokenizer + + config = BertConfig( + vocab_size=len(vocab), + hidden_size=2, + num_attention_heads=1, + num_hidden_layers=2, + intermediate_size=2, + max_position_embeddings=512, + ) + + bert = BertModel(config) + tokenizer = BertTokenizer(vocab_file) + + bert.save_pretrained(local_dir) + tokenizer.save_pretrained(local_dir) + + yield local_dir + finally: + shutil.rmtree(local_dir) + print("Cleared temporary DistilBERT model") + + +@pytest.fixture(name="distilbert_path", scope="session") +def distilbert_path_fixture() -> str: + try: + local_dir = tempfile.mkdtemp() + + import torch + + torch.manual_seed(42) + vocab = [ + "[PAD]", + "[UNK]", + "[CLS]", + "[SEP]", + "[MASK]", + "the", + "of", + "and", + "in", + "to", + "was", + "he", + ] + vocab_file = os.path.join(local_dir, "vocab.txt") + with open(vocab_file, "w") as f: + f.write("\n".join(vocab)) + + from transformers import DistilBertConfig, DistilBertModel, DistilBertTokenizer + + config = DistilBertConfig( + vocab_size=len(vocab), + hidden_size=2, + num_attention_heads=1, + num_hidden_layers=2, + intermediate_size=2, + max_position_embeddings=512, + ) + + bert = DistilBertModel(config) + tokenizer = DistilBertTokenizer(vocab_file) + + bert.save_pretrained(local_dir) + tokenizer.save_pretrained(local_dir) + + yield local_dir + finally: + shutil.rmtree(local_dir) + print("Cleared temporary DistilBERT model") + + +@pytest.fixture(name="scifact_path", scope="session") +def scifact_path_fixture() -> str: + return "sample-data/scifact" diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/integration/test_inference.py b/tests/integration/test_inference.py new file mode 100644 index 0000000..ab2469b --- /dev/null +++ 
b/tests/integration/test_inference.py
@@ -0,0 +1,55 @@
+import os
+import shutil
+import pytest
+from sprint.inference import aio
+
+
+@pytest.mark.parametrize(
+    "ckpt_name, encoder_name",
+    [
+        ("bert_path", "unicoil"),
+        ("distilbert_path", "splade"),
+        ("distilbert_path", "sparta"),
+        ("bert_path", "deepimpact"),
+    ],
+)
+def test_aio(
+    ckpt_name: str, encoder_name: str, scifact_path: str, request: pytest.FixtureRequest
+) -> None:
+    """Run the all-in-one pipeline (``aio.run``) once per parametrized encoder.
+
+    ``ckpt_name`` arrives as the name of a checkpoint fixture and is resolved
+    to that fixture's value via ``request.getfixturevalue``; ``scifact_path``
+    is passed as both the training and the evaluation data directory. No
+    metric is asserted: the test passes when ``aio.run`` does not raise.
+    """
+    ckpt_name = request.getfixturevalue(ckpt_name)
+    output_dir = "pytest-output"
+    # Only referenced by the cleanup below; presumably the quantization step
+    # writes to "<output_dir>-quantized" -- TODO confirm against aio.run.
+    output_quantized_dir = "pytest-output-quantized"
+    try:
+        aio.run(
+            encoder_name=encoder_name,
+            ckpt_name=ckpt_name,
+            data_name="beir/scifact",
+            train_data_dir=scifact_path,
+            eval_data_dir=scifact_path,
+            gpus=["cpu"],
+            output_dir=output_dir,
+            do_quantization=True,
+            quantization_method="range-nbits",  # So the doc term weights will be quantized by `(term_weights / 5) * (2 ** 8)`
+            original_score_range=5,
+            quantization_nbits=8,
+            original_query_format="beir",
+            topic_split="test",
+        )
+        # You would get "NDCG@10": 0.68563
+    finally:
+        # Remove run artifacts whether or not aio.run raised.
+        for path in [output_dir, output_quantized_dir]:
+            if os.path.exists(path):
+                shutil.rmtree(path)
+
+
+# TODO: Add reranking tests