59 changes: 52 additions & 7 deletions src/pubget/_labelbuddy.py
@@ -53,6 +53,10 @@

# Body
{body}

# Table(s)

{tables}
"""


@@ -85,17 +89,43 @@ def _format_authors(doc_authors: pd.DataFrame) -> str:
    )


def _format_tables(doc_tables: pd.DataFrame, root_dir: Path) -> str:
    """Display tables in a string."""
    table_texts = []
    for _, table_info in doc_tables.iterrows():
        table_path = root_dir.joinpath(table_info["table_data_file"])
        table_id = "None" if pd.isna(table_info["table_id"]) else \
            table_info["table_id"]
        table_label = "None" if pd.isna(table_info["table_label"]) else \
            table_info["table_label"]
        table_str = f"## ID: {table_id}\n\n### Label: {table_label}\n\n"
        df = pd.read_csv(table_path, encoding="utf-8")
        table_str += df.to_string() + "\n\n"
        table_caption = "None" if pd.isna(table_info["table_caption"]) else \
            table_info["table_caption"]
        table_foot = "None" if pd.isna(table_info["table_foot"]) else \
            table_info["table_foot"]
        table_str += f"### Caption\n\n{table_caption}\n\n"
        table_str += f"### Footer\n\n{table_foot}\n"
        table_texts.append(table_str)

    return "\n\n\n".join(table_texts)


def _prepare_document(
    doc_text: pd.Series,
    doc_meta: pd.Series,
    doc_tables: pd.DataFrame,
    doc_authors: pd.DataFrame,
    root_dir: Path,
    batch: int,
) -> Dict[str, Any]:
    """Extract information for one article and prepare labelbuddy document."""
    doc_text = doc_text.fillna("")
    doc_info: Dict[str, Any] = {}
    fields = {**doc_text, **doc_meta}
    fields["authors"] = _format_authors(doc_authors)
    fields["tables"] = _format_tables(doc_tables, root_dir)
    doc_info["text"] = _TEMPLATE.format(**fields)
    doc_info["metadata"] = {
        "pmcid": int(doc_meta["pmcid"]),
@@ -122,8 +152,13 @@ def _prepare_document(


def _iter_corpus(
    text_fh: TextIO, metadata_fh: TextIO, authors: pd.DataFrame
) -> Generator[Tuple[pd.Series, pd.Series, pd.DataFrame], None, None]:
    text_fh: TextIO,
    metadata_fh: TextIO,
    tables: pd.DataFrame,
    authors: pd.DataFrame
) -> Generator[
    Tuple[pd.Series, pd.Series, pd.DataFrame, pd.DataFrame], None, None
]:
    """Iterate over articles and provide text, metadata, tables, authors."""
    all_text_chunks = pd.read_csv(text_fh, chunksize=200)
    all_metadata_chunks = pd.read_csv(metadata_fh, chunksize=200)
@@ -136,17 +171,21 @@ def _iter_corpus(
    ):
        n_articles += 1
        assert doc_meta["pmcid"] == doc_text["pmcid"]
        doc_tables = tables[tables["pmcid"] == doc_meta["pmcid"]]
        doc_authors = authors[authors["pmcid"] == doc_meta["pmcid"]]
        if not n_articles % _LOG_PERIOD:
            _LOG.info(f"Read {n_articles} articles.")
        yield doc_text, doc_meta, doc_authors
        yield doc_text, doc_meta, doc_tables, doc_authors
    _LOG.info(f"Read {n_articles} articles.")


def _write_labelbuddy_batch(
    all_docs: Iterator[Tuple[pd.Series, pd.Series, pd.DataFrame]],
    all_docs: Iterator[
        Tuple[pd.Series, pd.Series, pd.DataFrame, pd.DataFrame]
    ],
    batch_nb: int,
    batch_size: Optional[int],
    root_dir: Path,
    output_dir: Path,
) -> None:
    """Write labelbuddy documents to jsonl file.
@@ -164,7 +203,11 @@ def _write_labelbuddy_batch(
        while batch_size is None or n_written != batch_size:
            doc_info = next(all_docs)
            out_f.write(
                json.dumps(_prepare_document(*doc_info, batch=batch_nb))
                json.dumps(
                    _prepare_document(
                        *doc_info, root_dir=root_dir, batch=batch_nb
                    )
                )
            )
            out_f.write("\n")
            row = (int(doc_info[1]["pmcid"]), batch_file.name, n_written)
@@ -180,19 +223,21 @@ def _do_make_labelbuddy_documents(
    extracted_data_dir: Path, output_dir: Path, batch_size: Optional[int]
) -> None:
    """Perform the creation of the labelbuddy jsonl files."""
    root_dir = Path(extracted_data_dir).parent
    text_file = extracted_data_dir.joinpath("text.csv")
    metadata_file = extracted_data_dir.joinpath("metadata.csv")
    authors = pd.read_csv(extracted_data_dir.joinpath("authors.csv"))
    tables = pd.read_csv(extracted_data_dir.joinpath("tables.csv"))
    output_dir.joinpath("batch_info.csv").write_text("pmcid,file_name,line\n")
    with open(text_file, encoding="utf-8") as text_fh, open(
        metadata_file, encoding="utf-8"
    ) as metadata_fh:
        all_docs = _iter_corpus(text_fh, metadata_fh, authors)
        all_docs = _iter_corpus(text_fh, metadata_fh, tables, authors)
        batch_nb = 1
        while True:
            try:
                _write_labelbuddy_batch(
                    all_docs, batch_nb, batch_size, output_dir
                    all_docs, batch_nb, batch_size, root_dir, output_dir
                )
            except StopIteration:
                return
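For reference, a minimal standalone sketch of how the new _format_tables helper can be exercised, assuming this branch is installed as pubget. It is not part of the diff: the directory name, file name, and cell values below are made up; only the (doc_tables, root_dir) signature and the column names (pmcid, table_id, table_label, table_caption, table_foot, table_data_file) come from the code above.

from pathlib import Path

import pandas as pd

from pubget._labelbuddy import _format_tables

# Hypothetical layout: root_dir holds the extracted per-table CSV files.
root_dir = Path("demo_tables")
root_dir.mkdir(exist_ok=True)
pd.DataFrame({"x": [10, -10], "y": [20, -20], "z": [30, -30]}).to_csv(
    root_dir / "table_000.csv", index=False
)

# One row per table, mirroring the columns _format_tables reads.
doc_tables = pd.DataFrame(
    [
        {
            "pmcid": 123,
            "table_id": "T1",
            "table_label": "Table 1",
            "table_caption": "Peak activation coordinates.",
            "table_foot": None,  # missing values are rendered as "None"
            "table_data_file": "table_000.csv",  # relative to root_dir
        }
    ]
)

# Prints the "## ID" / "### Label" headings, the table rendered with
# DataFrame.to_string(), then the caption and footer sections.
print(_format_tables(doc_tables, root_dir))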
28 changes: 25 additions & 3 deletions tests/data/articleset.xml
@@ -5032,8 +5032,30 @@
</custom-meta-group>
</article-meta>
</front>
<body>The text of the article with coordinates,
brains, auditory cortex, memory, memory<table-wrap><table><tr><th>X</th><th>Y</th><th>Z</th></tr><tr><td>10</td><td>20</td><td>30</td></tr><tr><td>-10</td><td>-20</td><td>-30</td></tr></table></table-wrap></body>
<body>
The text of the article with coordinates, brains, auditory cortex, memory, memory.
<table-wrap>
<table>
<tr>
<th>X</th><th>Y</th><th>Z</th>
<th>Region</th><th>Activation_Level</th><th>Voxel_Count</th><th>Signal_Noise_Ratio</th><th>Timepoint</th>
<th>Intensity</th><th>Subject_ID</th><th>Confidence_Score</th><th>Cluster_Size</th><th>P_Value</th>
<th>Q_Value</th><th>Correlation</th><th>Noise_Level</th><th>Hemisphere</th><th>Mean_Signal</th>
<th>Std_Dev</th><th>Median_Signal</th><th>Session_ID</th><th>Scan_Type</th><th>Experiment_ID</th>
</tr>
<tr><td>10</td><td>20</td><td>30</td><td>Hippocampus</td><td>1.5</td><td>150</td><td>3.2</td><td>5</td><td>2.3</td><td>101</td><td>0.95</td><td>12</td><td>0.05</td><td>0.07</td><td>0.85</td><td>0.2</td><td>Left</td><td>1.7</td><td>0.5</td><td>1.6</td><td>SS01</td><td>fMRI</td><td>EXP001</td></tr>
<tr><td>-10</td><td>-20</td><td>-30</td><td>Prefrontal Cortex</td><td>2.1</td><td>200</td><td>2.8</td><td>10</td><td>1.9</td><td>102</td><td>0.88</td><td>14</td><td>0.03</td><td>0.09</td><td>0.75</td><td>0.3</td><td>Right</td><td>2.1</td><td>0.4</td><td>1.9</td><td>SS02</td><td>MEG</td><td>EXP002</td></tr>
<tr><td>15</td><td>25</td><td>35</td><td>Auditory Cortex</td><td>3.0</td><td>120</td><td>3.5</td><td>15</td><td>3.1</td><td>103</td><td>0.97</td><td>16</td><td>0.01</td><td>0.08</td><td>0.65</td><td>0.1</td><td>Left</td><td>2.5</td><td>0.3</td><td>2.4</td><td>SS03</td><td>EEG</td><td>EXP003</td></tr>
<tr><td>-15</td><td>-25</td><td>-35</td><td>Visual Cortex</td><td>1.8</td><td>180</td><td>2.9</td><td>20</td><td>2.2</td><td>104</td><td>0.89</td><td>13</td><td>0.02</td><td>0.06</td><td>0.72</td><td>0.2</td><td>Right</td><td>1.9</td><td>0.6</td><td>1.8</td><td>SS04</td><td>fMRI</td><td>EXP004</td></tr>
<tr><td>20</td><td>30</td><td>40</td><td>Thalamus</td><td>2.5</td><td>160</td><td>3.1</td><td>25</td><td>2.8</td><td>105</td><td>0.92</td><td>18</td><td>0.04</td><td>0.05</td><td>0.78</td><td>0.25</td><td>Left</td><td>2.2</td><td>0.4</td><td>2.1</td><td>SS05</td><td>MEG</td><td>EXP005</td></tr>
<tr><td>-20</td><td>-30</td><td>-40</td><td>Parietal Cortex</td><td>2.0</td><td>170</td><td>3.0</td><td>30</td><td>2.7</td><td>106</td><td>0.90</td><td>17</td><td>0.03</td><td>0.07</td><td>0.80</td><td>0.22</td><td>Right</td><td>2.0</td><td>0.5</td><td>1.9</td><td>SS06</td><td>EEG</td><td>EXP006</td></tr>
<tr><td>25</td><td>35</td><td>45</td><td>Medial Temporal</td><td>3.3</td><td>130</td><td>3.3</td><td>35</td><td>3.0</td><td>107</td><td>0.94</td><td>15</td><td>0.02</td><td>0.06</td><td>0.83</td><td>0.3</td><td>Left</td><td>2.3</td><td>0.7</td><td>2.2</td><td>SS07</td><td>fMRI</td><td>EXP007</td></tr>
<tr><td>-25</td><td>-35</td><td>-45</td><td>Occipital Cortex</td><td>1.7</td><td>190</td><td>2.7</td><td>40</td><td>1.8</td><td>108</td><td>0.91</td><td>19</td><td>0.03</td><td>0.08</td><td>0.70</td><td>0.1</td><td>Right</td><td>1.5</td><td>0.3</td><td>1.4</td><td>SS08</td><td>MEG</td><td>EXP008</td></tr>
<tr><td>30</td><td>40</td><td>50</td><td>Cerebellum</td><td>2.4</td><td>140</td><td>3.6</td><td>45</td><td>2.9</td><td>109</td><td>0.93</td><td>11</td><td>0.05</td><td>0.07</td><td>0.84</td><td>0.3</td><td>Left</td><td>2.0</td><td>0.4</td><td>1.8</td><td>SS09</td><td>EEG</td><td>EXP009</td></tr>
<tr><td>-30</td><td>-40</td><td>-50</td><td>Basal Ganglia</td><td>1.9</td><td>210</td><td>2.5</td><td>50</td><td>1.6</td><td>110</td><td>0.87</td><td>20</td><td>0.06</td><td>0.05</td><td>0.82</td><td>0.2</td><td>Right</td><td>1.7</td><td>0.6</td><td>1.5</td><td>SS10</td><td>fMRI</td><td>EXP010</td></tr>
</table>
</table-wrap>
</body>
<back>
<fn-group>
<fn>
@@ -7695,4 +7717,4 @@
</ref-list>
</back>
</article>
</pmc-articleset>
</pmc-articleset>
1 change: 1 addition & 0 deletions tests/test_labelbuddy.py
@@ -96,6 +96,7 @@ def _check_batch_info(labelbuddy_dir):
"title": "fmri",
"publication_year": 2000,
"journal": "Journ. Brain. Imag.",
"tables": "x\ty\tz",
},
),
(