
Commit d9f55ca

update version with shared logic
1 parent e6d044e commit d9f55ca

325 files changed: +18766 -0 lines changed


pipeline_logic/v2/act/transform-python/src/act/__init__.py

Whitespace-only changes.

pipeline_logic/v2/act/transform-python/src/act/anchor/__init__.py

Whitespace-only changes.
Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
"""
This is a very precisely created file, do not change it. It was created to trick Foundry Templates into giving us the
path of the root folder of the deployed template. In generate-anchor.py, we use the anchor path defined in path.py to
create a dummy anchor dataset at the root of the project. Then when a new instance of the template is deployed, this
anchor path is automatically replaced with the path of the anchor dataset in the deployed template. Then to get the
root, we simply remove the name "anchor". Finally, we can use this root path in the rest of the repo. Doing this
allowed us to massively de-duplicate repeated code, in some steps reducing the number of lines of code by more than 90%.
"""

from transforms.api import transform_df, Output
from act.anchor import path


@transform_df(
    Output(path.anchor)
)
def compute(ctx):
    return ctx.spark_session.range(1)
Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
"""
This is a very precisely created file, do not change it. It was created to trick Foundry Templates into giving us the
path of the root folder of the deployed template. In generate-anchor.py, we use the anchor path defined in path.py to
create a dummy anchor dataset at the root of the project. Then when a new instance of the template is deployed, this
anchor path is automatically replaced with the path of the anchor dataset in the deployed template. Then to get the
root, we simply remove the name "anchor". Finally, we can use this root path in the rest of the repo. Doing this
allowed us to massively de-duplicate repeated code, in some steps reducing the number of lines of code by more than 90%.
"""

anchor = "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/anchor"
root = anchor[:-len("anchor")]
transform = root + "transform/"
metadata = root + "metadata/"
union_staging = root + "union_staging/"

input_zip = "/UNITE/Data Ingestion & OMOP Mapping/raw_data/Zipped Datasets/site_411_act_raw_zips"
site_id = '/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - Site 411'
all_ids = "/UNITE/Data Ingestion & OMOP Mapping/raw_data/data partner id tables/Data Partner IDs - ALL"
mapping = "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Site 411/metadata/n3c_vocab_map"
vocab = "/N3C Export Area/OMOP Vocabularies/vocabulary"
concept = "/N3C Export Area/OMOP Vocabularies/concept"

mapping_overrides = "/UNITE/Data Ingestion & OMOP Mapping/Source Data Model: ACT/Reference Tables/Vocab ID Mapping/act_vocab_id_mapping_table"
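
For illustration only: a minimal sketch (not part of this commit) of how a downstream step might consume these shared paths, which is the de-duplication the docstring above describes. The output dataset name "observation_fact_parsed" is a hypothetical example; path.transform and path.site_id come from path.py above.

from transforms.api import transform_df, Input, Output

from act.anchor import path


@transform_df(
    # Assumed dataset name, composed from the shared root instead of a hard-coded "/UNITE/..." path
    Output(path.transform + "observation_fact_parsed"),
    site_ids=Input(path.site_id),
)
def compute(site_ids):
    # Placeholder body: a real step would transform its inputs here
    return site_ids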
Lines changed: 209 additions & 0 deletions
@@ -0,0 +1,209 @@
from pyspark.sql import types as T


complete_domain_schema_dict = {
    'concept_dimension': {
        'CONCEPT_PATH': T.StringType(),
        'CONCEPT_CD': T.StringType(),
        'NAME_CHAR': T.StringType(),
        'UPDATE_DATE': T.TimestampType(),
        'DOWNLOAD_DATE': T.TimestampType(),
        'IMPORT_DATE': T.TimestampType(),
        'SOURCESYSTEM_CD': T.StringType(),
        'UPLOAD_ID': T.IntegerType(),
    },

    "control_map": {
        "CASE_PATID": T.StringType(),
        "BUDDY_NUM": T.IntegerType(),
        "CONTROL_PATID": T.StringType(),
        "CASE_AGE": T.IntegerType(),
        "CASE_SEX": T.StringType(),
        "CASE_RACE": T.StringType(),
        "CASE_ETHN": T.StringType(),
        "CONTROL_AGE": T.IntegerType(),
        "CONTROL_SEX": T.StringType(),
        "CONTROL_RACE": T.StringType(),
        "CONTROL_ETHN": T.StringType()
    },

    "note": {
        "NOTE_ID": T.StringType(),
        "PERSON_ID": T.LongType(),
        "NOTE_DATE": T.DateType(),
        "NOTE_DATETIME": T.TimestampType(),
        "NOTE_TYPE_CONCEPT_ID": T.IntegerType(),
        "NOTE_CLASS_CONCEPT_ID": T.IntegerType(),
        "NOTE_TITLE": T.StringType(),
        "NOTE_TEXT": T.StringType(),
        "ENCODING_CONCEPT_ID": T.IntegerType(),
        "LANGUAGE_CONCEPT_ID": T.IntegerType(),
        "PROVIDER_ID": T.IntegerType(),
        "VISIT_OCCURRENCE_ID": T.IntegerType(),
        "VISIT_DETAIL_ID": T.IntegerType(),
        "NOTE_SOURCE_VALUE": T.StringType(),
    },

    "note_nlp": {
        "NOTE_NLP_ID": T.LongType(),
        "NOTE_ID": T.LongType(),
        "SECTION_CONCEPT_ID": T.IntegerType(),
        "SNIPPET": T.StringType(),
        "OFFSET": T.StringType(),
        "LEXICAL_VARIANT": T.StringType(),
        "NOTE_NLP_CONCEPT_ID": T.IntegerType(),
        "NOTE_NLP_SOURCE_CONCEPT_ID": T.IntegerType(),
        "NLP_SYSTEM": T.StringType(),
        "NLP_DATE": T.DateType(),
        "NLP_DATETIME": T.TimestampType(),
        "TERM_EXISTS": T.BooleanType(),
        "TERM_TEMPORAL": T.StringType(),
        "TERM_MODIFIERS": T.StringType()
    },

    'observation_fact': {
        'ENCOUNTER_NUM': T.StringType(),
        'CONCEPT_CD': T.StringType(),
        'PROVIDER_ID': T.StringType(),
        'START_DATE': T.TimestampType(),
        'PATIENT_NUM': T.StringType(),
        'MODIFIER_CD': T.StringType(),
        'INSTANCE_NUM': T.StringType(),
        'VALTYPE_CD': T.StringType(),
        'TVAL_CHAR': T.StringType(),
        'NVAL_NUM': T.DecimalType(18, 5),
        'VALUEFLAG_CD': T.StringType(),
        'QUANTITY_NUM': T.DecimalType(18, 5),
        'UNITS_CD': T.StringType(),
        'END_DATE': T.TimestampType(),
        'LOCATION_CD': T.StringType(),
        'OBSERVATION_BLOB': T.StringType(),
        'CONFIDENCE_NUM': T.DecimalType(18, 5),
        'UPDATE_DATE': T.TimestampType(),
        'DOWNLOAD_DATE': T.TimestampType(),
        'IMPORT_DATE': T.TimestampType(),
        'SOURCESYSTEM_CD': T.StringType(),
        'UPLOAD_ID': T.IntegerType(),
    },

    'patient_dimension': {
        'PATIENT_NUM': T.StringType(),
        'VITAL_STATUS_CD': T.StringType(),
        'BIRTH_DATE': T.TimestampType(),
        'DEATH_DATE': T.TimestampType(),
        'SEX_CD': T.StringType(),
        'AGE_IN_YEARS_NUM': T.IntegerType(),
        'LANGUAGE_CD': T.StringType(),
        'RACE_CD': T.StringType(),
        'MARITAL_STATUS_CD': T.StringType(),
        'RELIGION_CD': T.StringType(),
        'ZIP_CD': T.StringType(),
        'STATECITYZIP_PATH': T.StringType(),
        'PATIENT_BLOB': T.StringType(),
        'UPDATE_DATE': T.TimestampType(),
        'DOWNLOAD_DATE': T.TimestampType(),
        'IMPORT_DATE': T.TimestampType(),
        'SOURCESYSTEM_CD': T.StringType(),
        'UPLOAD_ID': T.IntegerType(),
        'INCOME_CD': T.StringType(),
        "ETHNICITY_CD": T.StringType()
    },

    'visit_dimension': {
        'ENCOUNTER_NUM': T.StringType(),
        'PATIENT_NUM': T.StringType(),
        'ACTIVE_STATUS_CD': T.StringType(),
        'START_DATE': T.TimestampType(),
        'END_DATE': T.TimestampType(),
        'INOUT_CD': T.StringType(),
        'LOCATION_CD': T.StringType(),
        'LOCATION_PATH': T.StringType(),
        'LENGTH_OF_STAY': T.StringType(),
        'VISIT_BLOB': T.StringType(),
        'UPDATE_DATE': T.TimestampType(),
        'DOWNLOAD_DATE': T.TimestampType(),
        'IMPORT_DATE': T.TimestampType(),
        'SOURCESYSTEM_CD': T.StringType(),
        'UPLOAD_ID': T.IntegerType(),
    },
}

required_domain_schema_dict = {
    'concept_dimension': {
        'CONCEPT_PATH': T.StringType(),
        'CONCEPT_CD': T.StringType(),
        'NAME_CHAR': T.StringType(),
    },

    'control_map': {
        "CASE_PATID": T.StringType(),
        "BUDDY_NUM": T.IntegerType(),
        "CONTROL_PATID": T.StringType()
    },

    'note': {},

    'note_nlp': {},

    'observation_fact': {
        'ENCOUNTER_NUM': T.StringType(),
        'CONCEPT_CD': T.StringType(),
        'PROVIDER_ID': T.StringType(),
        'START_DATE': T.TimestampType(),
        'PATIENT_NUM': T.StringType(),
        'MODIFIER_CD': T.StringType(),
        'INSTANCE_NUM': T.StringType(),
    },

    'patient_dimension': {
        'PATIENT_NUM': T.StringType(),
    },

    'visit_dimension': {
        'ENCOUNTER_NUM': T.StringType(),
        'PATIENT_NUM': T.StringType(),
    },
}

act_local_code_map_schema = {
    "ACT_STANDARD_CODE": T.StringType(),
    "LOCAL_CONCEPT_CD": T.StringType(),
    "NAME_CHAR": T.StringType(),
    "PARENT_CONCEPT_PATH": T.StringType(),
    "CONCEPT_PATH": T.StringType(),
    "PATH_ELEMENT": T.StringType()
}

data_counts_schema = {
    "TABLE_NAME": T.StringType(),
    "ROW_COUNT": T.StringType()
}

manifest_schema = {
    "SITE_ABBREV": T.StringType(),
    "SITE_NAME": T.StringType(),
    "CONTACT_NAME": T.StringType(),
    "CONTACT_EMAIL": T.StringType(),
    "CDM_NAME": T.StringType(),
    "CDM_VERSION": T.StringType(),
    "VOCABULARY_VERSION": T.StringType(),
    "N3C_PHENOTYPE_YN": T.StringType(),
    "N3C_PHENOTYPE_VERSION": T.StringType(),
    "SHIFT_DATE_YN": T.StringType(),
    "MAX_NUM_SHIFT_DAYS": T.StringType(),
    "RUN_DATE": T.StringType(),
    "UPDATE_DATE": T.StringType(),
    "NEXT_SUBMISSION_DATE": T.StringType(),
}

n3c_vocab_map_schema = {
    "LOCAL_PREFIX": T.StringType(),
    "OMOP_VOCAB": T.StringType()
}

metadata_schemas = {
    "act_standard2local_code_map": act_local_code_map_schema,
    "data_counts": data_counts_schema,
    "manifest": manifest_schema,
    "n3c_vocab_map": n3c_vocab_map_schema
}
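
The next file imports schema_dict_to_struct from this module and calls it as schema_dict_to_struct(schema_dict, all_string_type=True), but the helper itself is not visible in this excerpt of the commit. The following is only a plausible sketch inferred from that call site, not the committed implementation.

from pyspark.sql import types as T


def schema_dict_to_struct(schema_dict, all_string_type=False):
    # Sketch (assumed): build a StructType from one of the schema dicts above.
    # When all_string_type is True, every column becomes a string, which is
    # useful for reading raw site submissions before any type casting.
    fields = []
    for col_name, col_type in schema_dict.items():
        data_type = T.StringType() if all_string_type else col_type
        fields.append(T.StructField(col_name, data_type, True))
    return T.StructType(fields)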
Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
import csv
import tempfile
import shutil
from transforms.api import TransformInput, TransformOutput
from pyspark.sql import Row
from act.act_schemas import complete_domain_schema_dict, schema_dict_to_struct
from act.site_specific_utils import get_site_dialect_params


def parse_input(ctx, my_input: TransformInput, error_df: TransformOutput, site_id: int, domain: str, regex: str):

    def process_file(file_status):
        # Copy contents of file from Foundry into temp file
        with tempfile.NamedTemporaryFile() as t:
            with my_input.filesystem().open(file_status.path, 'rb') as f_bytes:
                shutil.copyfileobj(f_bytes, t)
                t.flush()

            # Read the csv, line by line, and use csv.Sniffer to infer the delimiter
            # Write any improperly formatted rows to the errors DataFrame
            with open(t.name, newline="", encoding="utf8", errors='ignore') as f:
                with error_df.filesystem().open('error_rows', 'w', newline='') as writeback:
                    dialect = csv.Sniffer().sniff(f.read(1024))
                    f.seek(0)
                    dialect_params = get_site_dialect_params(site_id, domain)
                    r = csv.reader(f, delimiter=dialect.delimiter, **dialect_params)
                    w = csv.writer(writeback)

                    # Construct a pyspark.Row from our header row
                    header = next(r)
                    MyRow = Row(*header)
                    expected_num_fields = len(header)

                    error_encountered = False
                    for i, row in enumerate(r):
                        if len(row) == expected_num_fields:
                            # Properly formatted row
                            yield MyRow(*row)
                        elif not row:
                            continue  # ignore empty rows/extra newlines
                        else:
                            # Improperly formatted row
                            if not error_encountered:
                                # Create header for output csv
                                w.writerow(["row_number", "row_contents"])
                                error_encountered = True
                            # Write to a csv file in the errors dataset, recording the row number and malformed row
                            malformed_row = "|".join(row)
                            w.writerow([str(i), malformed_row])

    files_df = my_input.filesystem().files(regex=regex)
    processed_rdd = files_df.rdd.flatMap(process_file)

    if processed_rdd.isEmpty():
        # csv file for the domain is empty
        # Get OrderedDict that specifies this domain's schema
        schema_dict = complete_domain_schema_dict[domain]
        # Create StructType for the schema with all types as strings
        struct_schema = schema_dict_to_struct(schema_dict, all_string_type=True)
        # Create empty dataset with proper columns, all string types
        processed_df = processed_rdd.toDF(struct_schema)
    else:
        # Create dataset with whatever columns the site gave us, all string types
        processed_df = processed_rdd.toDF()

    return processed_df
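
For context, a hedged sketch of how parse_input might be wired into a Foundry transform. The module name act.parse_utils, the dataset names, the site_id value, and the regex are placeholders not taken from this commit; only the parse_input signature and the path module are grounded in the diff.

from transforms.api import transform, Input, Output

from act.anchor import path
from act.parse_utils import parse_input  # assumed module name for the file above


@transform(
    parsed=Output(path.transform + "observation_fact_parsed"),  # assumed output dataset
    errors=Output(path.transform + "observation_fact_errors"),  # assumed errors dataset
    raw=Input(path.input_zip),
)
def compute(ctx, parsed, errors, raw):
    # Parse every file in the raw dataset that matches the (placeholder) regex,
    # writing malformed rows to the errors dataset as a side effect
    df = parse_input(ctx, raw, errors, site_id=411,
                     domain="observation_fact", regex=".*OBSERVATION_FACT.*")
    parsed.write_dataframe(df)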
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
from transforms.api import Pipeline

from act import datasets, anchor


my_pipeline = Pipeline()
my_pipeline.discover_transforms(datasets, anchor)
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window


def new_duplicate_rows_with_collision_bits(omop_domain, lookup_df, ctx, pk_col, full_hash_col):

    # Extract all duplicate rows from domain table
    # Keep two columns: 51 bit hash (which caused collision) and full hash (to differentiate collisions)
    w = Window.partitionBy(pk_col)
    duplicates_df = omop_domain.dataframe().select('*', F.count(pk_col).over(w).alias('dupeCount'))\
        .where('dupeCount > 1')\
        .drop('dupeCount')
    duplicates_df = duplicates_df.select(pk_col, full_hash_col)

    if ctx.is_incremental:
        # Count how many rows in the lookup table exist for the collided hash value
        cache = lookup_df.dataframe('previous', schema=T.StructType([
            T.StructField(pk_col, T.LongType(), True),
            T.StructField(full_hash_col, T.StringType(), True),
            T.StructField("collision_bits", T.IntegerType(), True)
        ]))
        cache_count = cache.groupby(pk_col).count()

        # Keep only the rows in duplicates_df that are not currently in lookup table
        cond = [pk_col, full_hash_col]
        duplicates_df = duplicates_df.join(cache, cond, 'left_anti')

    # Create counter for rows in duplicates_df
    # Subtract 1 because the default collision resolution bit value is 0
    w2 = Window.partitionBy(pk_col).orderBy(pk_col)
    duplicates_df = duplicates_df.withColumn('row_num', F.row_number().over(w2))
    duplicates_df = duplicates_df.withColumn('row_num', (F.col('row_num') - 1))

    # If there are already entries in the lookup table for the given primary key,
    # then add the number of existing entries to the row number counter
    if ctx.is_incremental:
        duplicates_df = duplicates_df.join(cache_count, pk_col, 'left')
        duplicates_df = duplicates_df.fillna(0, subset=['count'])
        duplicates_df = duplicates_df.withColumn('row_num', (F.col('row_num') + F.col('count').cast(T.IntegerType())))

    duplicates_df = duplicates_df.withColumnRenamed('row_num', 'collision_bits')

    # Remove 'count' column for incremental transforms:
    duplicates_df = duplicates_df.select(pk_col, full_hash_col, 'collision_bits')

    return duplicates_df
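
Design note: the returned lookup of (primary key, full hash, collision_bits) gives each row that shares the same 51-bit hash its own small counter. A rough, assumed sketch of how a downstream step could use it to produce unique final identifiers; placing collision_bits in the high bits above the 51-bit hash is a guess for illustration, not something stated in this commit.

from pyspark.sql import functions as F


def apply_collision_bits(domain_df, lookup_df, pk_col, full_hash_col):
    # Assumed downstream step: rows without an entry in the lookup keep
    # collision_bits = 0; colliding rows get their counter shifted above
    # the 51-bit hash so every final key is distinct.
    resolved = (
        domain_df
        .join(lookup_df, [pk_col, full_hash_col], 'left')
        .fillna(0, subset=['collision_bits'])
        .withColumn(pk_col,
                    F.col(pk_col) + F.col('collision_bits').cast('long') * F.lit(2 ** 51))
        .drop('collision_bits')
    )
    return resolved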
