Skip to content

Commit

Permalink
Trying to prevent gene id getting appended at end of modified gtf
Browse files Browse the repository at this point in the history
  • Loading branch information
j-berg committed Jul 1, 2024
1 parent 8a5342d commit 098183d
Showing 1 changed file with 23 additions and 21 deletions.
44 changes: 23 additions & 21 deletions xpresspipe/gtfModify.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def get_chunks(gtf, threads=None, identifier='gene_id \"'):
for y in range(cores):
# If the last chunk, get the remainder of the GTF dataframe
if y == cores - 1:
new_chunk = gtf.loc[start:]
new_chunk = gtf.loc[start:].copy()
else:
end = start + batch # Set tentative end of next chunk

Expand All @@ -277,7 +277,7 @@ def get_chunks(gtf, threads=None, identifier='gene_id \"'):
gtf_remainder = gtf.iloc[start:end]

n = -1 # Start at current record
gene_id_original = gtf_remainder.at[end + n, gtf_annotation_column][(gtf_remainder.at[end + n, gtf_annotation_column].find('gene_id \"') + 9):].split('\";')[0]
gene_id_original = gtf_remainder.at[end + n, gtf_annotation_column].split('gene_id \"')[1].split('\";')[0]
gene_id_next = gene_id_original

amount = -1 # Set scan direction as reverse
Expand All @@ -290,13 +290,13 @@ def get_chunks(gtf, threads=None, identifier='gene_id \"'):
n += amount # Take another step back until a new gene_id is found (or forward if hit the start)
if end + n >= len(gtf.index): # Check if the index exceeds the dataframe length
break
gene_id_next = gtf_remainder.at[end + n, gtf_annotation_column][(gtf_remainder.at[end + n, gtf_annotation_column].find('gene_id \"') + 9):].split('\";')[0]
gene_id_next = gtf_remainder.at[end + n, gtf_annotation_column].split('gene_id \"')[1].split('\";')[0]

if amount == 1: # Correct for scan direction pivot
n -= 1

# Parse out current chunk
new_chunk = gtf.loc[start:end + n]
new_chunk = gtf.loc[start:end + n].copy()
start = end + n + 1 # End coordinate for last chunk to start with next

if not new_chunk.empty:
Expand Down Expand Up @@ -348,7 +348,7 @@ def run_chunks(

"""Run all GTF-editing functions"""
def edit_gtf(
gtf, # Dataframe of file path and name to GTF reference
gtf, # File path and name to GTF reference or DataFrame
longest_transcript=True,
protein_coding=True,
truncate_reference=True,
Expand All @@ -365,7 +365,7 @@ def edit_gtf(
output = False # Turn off intermediates output
file_name = None
elif str(gtf).endswith('.gtf'):
file_name = str(gtf[:-gtf_rightCoordinate_column]) + '_' # Get rid of GTF extension for now
file_name = str(gtf[:-4]) + '_' # Get rid of GTF extension for now

# Read the file and store comments
with open(str(gtf), 'r') as f:
Expand All @@ -388,7 +388,7 @@ def edit_gtf(
# Get chunks
chunks = get_chunks(
gtf,
threads = threads)
threads=threads)

gtf = None # Garbage management
gc.collect()
Expand All @@ -399,7 +399,7 @@ def edit_gtf(
chunks = run_chunks(
longest_transcripts,
chunks,
target_message = 'longest transcripts')
target_message='longest transcripts')

if output == True:
file_name = str(file_name) + 'L'
Expand All @@ -410,17 +410,16 @@ def edit_gtf(

# Get only protein coding annotated records
if protein_coding == True:

if ucsc_formatted == True:
chunks = run_chunks(
get_ucsc_proteins,
chunks,
target_message = 'protein coding genes')
target_message='protein coding genes')
else:
chunks = run_chunks(
protein_gtf,
chunks,
target_message = 'protein coding genes')
target_message='protein coding genes')

if output == True:
file_name = str(file_name) + 'C'
Expand All @@ -430,13 +429,13 @@ def edit_gtf(
if truncate_reference == True:
func = partial(
truncate_gtf,
_5prime = _5prime,
_3prime = _3prime,
ucsc_formatted = ucsc_formatted)
_5prime=_5prime,
_3prime=_3prime,
ucsc_formatted=ucsc_formatted)
chunks = run_chunks(
func,
chunks,
target_message = 'truncation')
target_message='truncation')

if output == True:
file_name = str(file_name) + 'T'
Expand All @@ -446,6 +445,9 @@ def edit_gtf(
gtf = pd.concat(chunks)
gtf = gtf.reset_index(drop=True)

# Remove any extra columns (keeping only the first 9 columns)
gtf = gtf.iloc[:, :9]

chunks = None # Garbage management
gc.collect()

Expand All @@ -458,14 +460,14 @@ def edit_gtf(
# Append the modified GTF data
gtf.to_csv(
str(file_name) + '.gtf',
sep = '\t',
header = None,
index = False,
quoting = csv.QUOTE_NONE,
sep='\t',
header=None,
index=False,
quoting=csv.QUOTE_NONE,
mode='a' # Append mode
)
else:
return gtf
return gtf

else:
raise Warning('0 chunks of the original file remain')
raise Warning('0 chunks of the original file remain')

0 comments on commit 098183d

Please sign in to comment.