Skip to content

Commit

Permalink
📊 cancer: NHS stats on diagnosis routes and survival rates (#3290)
Browse files Browse the repository at this point in the history
  • Loading branch information
veronikasamborska1994 authored Sep 16, 2024
1 parent e601a70 commit 331e28a
Show file tree
Hide file tree
Showing 19 changed files with 719 additions and 1 deletion.
24 changes: 23 additions & 1 deletion dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -784,4 +784,26 @@ steps:
data://garden/who/2024-09-09/flu_test:
- data://meadow/who/latest/flunet
data://grapher/who/2024-09-09/flu_test:
- data://garden/who/2024-09-09/flu_test
- data://garden/who/2024-09-09/flu_test

# Cancer diagnosis routes and survival rates
data://meadow/cancer/2024-09-13/diagnosis_routes_by_route:
- snapshot://cancer/2024-09-13/diagnosis_routes_by_route.csv
data://garden/cancer/2024-09-13/diagnosis_routes_by_route:
- data://meadow/cancer/2024-09-13/diagnosis_routes_by_route
data://grapher/cancer/2024-09-13/diagnosis_routes_by_route:
- data://garden/cancer/2024-09-13/diagnosis_routes_by_route

data://meadow/cancer/2024-09-13/diagnosis_routes_by_stage:
- snapshot://cancer/2024-09-13/diagnosis_routes_by_stage.csv
data://garden/cancer/2024-09-13/diagnosis_routes_by_stage:
- data://meadow/cancer/2024-09-13/diagnosis_routes_by_stage
data://grapher/cancer/2024-09-13/diagnosis_routes_by_stage:
- data://garden/cancer/2024-09-13/diagnosis_routes_by_stage

data://meadow/cancer/2024-09-13/diagnosis_routes_survival:
- snapshot://cancer/2024-09-13/diagnosis_routes_survival.csv
data://garden/cancer/2024-09-13/diagnosis_routes_survival:
- data://meadow/cancer/2024-09-13/diagnosis_routes_survival
data://grapher/cancer/2024-09-13/diagnosis_routes_survival:
- data://garden/cancer/2024-09-13/diagnosis_routes_survival
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Cancer

diag_route: &diag_route |-
<% if route == "Screening" %>
Screening is flagged by the cancer registry as detected via the breast, bowel, or cervical screening programmes.
<% elif route == "USC" %>
USC (Urgent Suspected Cancer) refers to urgent GP referrals with a suspicion of cancer (previously known as two-week wait/TWW).
<% elif route == "GP referral" %>
Routine and urgent referrals where the patient was not referred under the USC (Urgent Suspected Cancer) referral route.
<% elif route == "Emergency presentation" %>
An emergency route via accident and emergency (A&E), emergency GP referral, emergency transfer, emergency admission or attendance.
<% elif route == "Other outpatient" %>
An elective route starting with an outpatient appointment that is either a self-referral, consultant to consultant referral, other or unknown referral (excludes patients originally referred under the USC referral route).
<% elif route == "Inpatient elective" %>
No earlier information can be found prior to admission from a waiting list, booked or planned.
<% elif route == "Unknown route" %>
No relevant data available from Inpatient or Outpatient Hospital Episode Statistics, National Cancer Waiting Times or National Screening Programmes.
<% endif %>
# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365



tables:
diagnosis_routes_by_route:
variables:
count_by_route:
title: Number of << stage.lower() >> cancer diagnoses via the << route.lower() >> route
description_key:
- *diag_route
unit: cases
display:
numDecimalPlaces: 0
name: << route >>
percentage_by_route:
title: Share of << stage.lower() >> cancer diagnoses via the << route.lower() >> route
description_key:
- *diag_route
unit: '%'
short_unit: '%'
display:
numDecimalPlaces: 1
name: << route >>
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Resolve the paths and naming conventions that belong to this step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the garden dataset `diagnosis_routes_by_route` from its meadow dataset.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.
    """
    # --- Inputs ---------------------------------------------------------
    # Load the meadow dataset and pull out its table with the index
    # flattened back into ordinary columns.
    meadow = paths.load_dataset("diagnosis_routes_by_route")
    table = meadow["diagnosis_routes_by_route"].reset_index()

    # --- Processing -----------------------------------------------------
    # Remove a leading run of digits followed by a single whitespace
    # character from every route label.
    cleaned_routes = table["route"].str.replace(r"^\d+\s", "", regex=True)
    table["route"] = cleaned_routes

    index_columns = ["country", "year", "site", "stage", "route"]
    table = table.format(index_columns)

    # --- Outputs --------------------------------------------------------
    # The garden dataset starts from the meadow dataset's metadata.
    garden = create_dataset(
        dest_dir,
        tables=[table],
        check_variables_metadata=True,
        default_metadata=meadow.metadata,
    )
    garden.save()
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Cancer
diag_route: &diag_route |-
<% if route == "Screening" %>
Screening is flagged by the cancer registry as detected via the breast, bowel, or cervical screening programmes.
<% elif route == "USC" %>
USC (Urgent Suspected Cancer) refers to urgent GP referrals with a suspicion of cancer (previously known as two-week wait/TWW).
<% elif route == "GP referral" %>
Routine and urgent referrals where the patient was not referred under the USC (Urgent Suspected Cancer) referral route.
<% elif route == "Emergency presentation" %>
An emergency route via accident and emergency (A&E), emergency GP referral, emergency transfer, emergency admission or attendance.
<% elif route == "Other outpatient" %>
An elective route starting with an outpatient appointment that is either a self-referral, consultant to consultant referral, other or unknown referral (excludes patients originally referred under the USC referral route).
<% elif route == "Inpatient elective" %>
No earlier information can be found prior to admission from a waiting list, booked or planned.
<% elif route == "Unknown route" %>
No relevant data available from Inpatient or Outpatient Hospital Episode Statistics, National Cancer Waiting Times or National Screening Programmes.
<% endif %>
# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365



tables:
diagnosis_routes_by_stage:
variables:
count_by_stage:
title: Number of cancer diagnoses via the << route.lower() >> route that are at << stage.lower() >>
description_key:
- *diag_route
display:
numDecimalPlaces: 0
name: << stage >>
unit: cases
percentage_by_stage:
title: Share of cancer diagnoses via the << route.lower() >> route that are at << stage.lower() >>
description_key:
- *diag_route
unit: '%'
short_unit: '%'
display:
numDecimalPlaces: 1
name: << stage >>
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Resolve the paths and naming conventions that belong to this step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Create the garden dataset `diagnosis_routes_by_stage` from its meadow dataset.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.
    """
    dataset_name = "diagnosis_routes_by_stage"

    # Inputs: the meadow dataset and its table, with the index turned back
    # into ordinary columns.
    source_ds = paths.load_dataset("diagnosis_routes_by_stage")
    stage_tb = source_ds[dataset_name].reset_index()

    # Processing: drop a leading run of digits plus one whitespace character
    # from each route label, then format the table on its key columns.
    stage_tb["route"] = stage_tb["route"].str.replace(r"^\d+\s", "", regex=True)
    key_columns = ["country", "year", "site", "stage", "route"]
    stage_tb = stage_tb.format(key_columns)

    # Outputs: write a garden dataset that starts from the meadow metadata.
    output_ds = create_dataset(
        dest_dir,
        tables=[stage_tb],
        check_variables_metadata=True,
        default_metadata=source_ds.metadata,
    )
    output_ds.save()
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Cancer
grapher_config:
note: The year represents the start of the 5-year age-standardized survival estimates by cancer site for 5-year rolling cohorts from 2006-2010 to 2016-2020.

diag_route: &diag_route |-
<% if route == "Screening" %>
Screening is flagged by the cancer registry as detected via the breast, bowel, or cervical screening programmes.
<% elif route == "USC" %>
USC (Urgent Suspected Cancer) refers to urgent GP referrals with a suspicion of cancer (previously known as two-week wait/TWW).
<% elif route == "GP referral" %>
Routine and urgent referrals where the patient was not referred under the USC (Urgent Suspected Cancer) referral route.
<% elif route == "Emergency presentation" %>
An emergency route via accident and emergency (A&E), emergency GP referral, emergency transfer, emergency admission or attendance.
<% elif route == "Other outpatient" %>
An elective route starting with an outpatient appointment that is either a self-referral, consultant to consultant referral, other or unknown referral (excludes patients originally referred under the USC referral route).
<% elif route == "Inpatient elective" %>
No earlier information can be found prior to admission from a waiting list, booked or planned.
<% elif route == "Unknown route" %>
No relevant data available from Inpatient or Outpatient Hospital Episode Statistics, National Cancer Waiting Times or National Screening Programmes.
<% endif %>
sex: |-
<% if gender == "Persons" %>all<% elif gender == "Male" %>male<% elif gender == "Female" %>female<% endif %>
# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365



tables:
diagnosis_routes_survival:
variables:
patients:
title: Number of {definitions.sex} patients diagnosed via << route.lower() >> surviving at the end of the << length.lower() >> period
description_short: |-
The number of {definitions.sex} patients diagnosed via << route.lower() >> route surviving at the end of the << length.lower() >> period.
description_key:
- *diag_route
unit: cases
display:
numDecimalPlaces: 0
name: << route >>
presentation:
title_public: Number of {definitions.sex} patients diagnosed via << route.lower() >> surviving at the end of the << length.lower() >> period

survival:
title: Age-standardized << length.lower() >> survival rate diagnosed via << route.lower() >> among {definitions.sex} patients
description_short: |-
The age-standardized << length.lower() >> survival rate diagnosed via << route.lower() >> route among {definitions.sex} patients.
description_key:
- *diag_route
unit: '%'
short_unit: '%'
display:
numDecimalPlaces: 1
name: << route >>
presentation:
title_public: Age-standardized << length.lower() >> survival rate diagnosed via << route.lower() >> among {definitions.sex} patients
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Load the meadow dataset `diagnosis_routes_survival` and create the garden dataset.

    The meadow `year` column holds cohort periods such as '2006-2010'; this step
    replaces each period with its first (start) year as an integer.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.
    """

    def _first_year(period: str) -> int:
        """Return the first year of a 'YYYY-YYYY' period, e.g. '2006-2010' -> 2006."""
        return int(period.split("-")[0])

    #
    # Load inputs.
    #
    # Load meadow dataset.
    ds_meadow = paths.load_dataset("diagnosis_routes_survival")

    # Read table from meadow dataset.
    tb = ds_meadow["diagnosis_routes_survival"].reset_index()

    #
    # Process data.
    #
    # Keep the FIRST year of each period (index 0 of the split), not the last.
    # This agrees with the dataset's grapher note, which describes the year as
    # the start of the 5-year cohort.
    # NOTE(review): the trailing `.astype(int)` is kept from the original;
    # presumably it guards against `apply` returning a non-integer dtype
    # (e.g. for a categorical column) - confirm before removing.
    tb["year"] = tb["year"].apply(_first_year).astype(int)

    # Remove a leading run of digits followed by one whitespace character from each route label.
    tb["route"] = tb["route"].str.replace(r"^\d+\s", "", regex=True)
    tb = tb.format(["country", "year", "site", "gender", "route", "length"])

    #
    # Save outputs.
    #
    # Create a new garden dataset with the same metadata as the meadow dataset.
    ds_garden = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
    )

    # Save changes in the new garden dataset.
    ds_garden.save()
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Load the garden dataset `diagnosis_routes_by_route` and create the grapher dataset.

    The cancer site takes the place of the `country` column, so that each site
    is the entity of the resulting table.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.

    Raises:
        ValueError: If the table contains a cancer site that has no entry in
            the label mapping (it would otherwise silently become NaN).
    """
    #
    # Load inputs.
    #
    # Load garden dataset.
    ds_garden = paths.load_dataset("diagnosis_routes_by_route")

    # Read table from garden dataset.
    tb = ds_garden["diagnosis_routes_by_route"].reset_index()

    # The original country column is discarded; the cancer site replaces it below.
    tb = tb.drop(columns=["country"])

    # Display label for each cancer site, with only the first word capitalized.
    cancer_mapping = {
        "All Malignant Neoplasms (excl. NMSC)": "All malignant neoplasms (excl. NMSC)",
        "Bladder": "Bladder cancer",
        "Breast": "Breast cancer",
        "Cervix": "Cervical cancer",
        "Colorectal": "Colorectal cancer",
        "Kidney": "Kidney cancer",
        "Lung - non-small cell": "Lung cancer (non-small cell)",
        "Lung - small cell": "Lung cancer (small cell)",
        "Ovary": "Ovarian cancer",
        "Pancreas": "Pancreatic cancer",
        "Prostate": "Prostate cancer",
        "Uterus": "Uterine cancer",
    }

    # `Series.map` turns every value missing from the mapping into NaN, which
    # would then be used as the entity name. Fail loudly instead, so a new site
    # in the source data is noticed and added to the mapping.
    unmapped_sites = sorted(set(tb["site"].dropna().unique()) - set(cancer_mapping))
    if unmapped_sites:
        raise ValueError(f"Cancer sites missing from cancer_mapping: {unmapped_sites}")

    # Map cancer types to descriptive labels.
    tb["site"] = tb["site"].map(cancer_mapping)

    # Make cancer type appear as country.
    tb = tb.rename(columns={"site": "country"})
    tb = tb.format(["country", "year", "stage", "route"])

    #
    # Save outputs.
    #
    # Create a new grapher dataset with the same metadata as the garden dataset.
    ds_grapher = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_garden.metadata
    )

    # Save changes in the new grapher dataset.
    ds_grapher.save()
Loading

0 comments on commit 331e28a

Please sign in to comment.