error at annotation #132

liukeweiaway · 2025-01-23T05:07:17Z

my code: sumstats.plot_mqq(skip=2,build="38",anno="GENENAME", sig_level_lead=1e-16)

error:
BadGzipFile Traceback (most recent call last)
File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gtfparse/read_gtf.py:100, in parse_gtf(filepath_or_buffer, chunksize, features, intern_columns, fix_quotes_columns)
99 try:
--> 100 for df in chunk_iterator:
101 for intern_column in intern_columns:

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1841, in TextFileReader.next(self)
1840 try:
-> 1841 return self.get_chunk()
1842 except StopIteration:

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1983, in TextFileReader.get_chunk(self, size)
1982 size = min(size, self.nrows - self._currow)
-> 1983 return self.read(nrows=size)

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pandas/io/parsers/readers.py:1921, in TextFileReader.read(self, nrows)
1915 try:
1916 # error: "ParserBase" has no attribute "read"
1917 (
1918 index,
1919 columns,
1920 col_dict,
-> 1921 ) = self._engine.read( # type: ignore[attr-defined]
1922 nrows
1923 )
1924 except Exception:

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pandas/io/parsers/c_parser_wrapper.py:234, in CParserWrapper.read(self, nrows)
233 if self.low_memory:
--> 234 chunks = self._reader.read_low_memory(nrows)
235 # destructive to chunks

File parsers.pyx:850, in pandas._libs.parsers.TextReader.read_low_memory()

File parsers.pyx:905, in pandas._libs.parsers.TextReader._read_rows()

File parsers.pyx:874, in pandas._libs.parsers.TextReader._tokenize_rows()

File parsers.pyx:891, in pandas._libs.parsers.TextReader._check_tokenize_status()

File parsers.pyx:2053, in pandas._libs.parsers.raise_parser_error()

File ~/miniforge3/envs/GWAS/lib/python3.9/_compression.py:68, in DecompressReader.readinto(self, b)
67 with memoryview(b) as view, view.cast("B") as byte_view:
---> 68 data = self.read(len(byte_view))
69 byte_view[:len(data)] = data

File ~/miniforge3/envs/GWAS/lib/python3.9/gzip.py:478, in _GzipReader.read(self, size)
473 if self._decompressor.eof:
474 # Ending case: we've come to the end of a member in the file,
475 # so finish up this member, and read a new gzip header.
476 # Check the CRC and file size, and set the flag so we read
477 # a new member
--> 478 self._read_eof()
479 self._new_member = True

File ~/miniforge3/envs/GWAS/lib/python3.9/gzip.py:524, in _GzipReader._read_eof(self)
523 if crc32 != self._crc:
--> 524 raise BadGzipFile("CRC check failed %s != %s" % (hex(crc32),
525 hex(self._crc)))
526 elif isize != (self._stream_size & 0xffffffff):

BadGzipFile: CRC check failed 0x837dc8f8 != 0xbaf6464a

During handling of the above exception, another exception occurred:

ParsingError Traceback (most recent call last)
Cell In[88], line 1
----> 1 sumstats.plot_mqq(skip=2,build="38",anno="GENENAME", sig_level_lead=1e-16)

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gwaslab/g_Sumstats.py:648, in Sumstats.plot_mqq(self, build, **kwargs)
645 if build is None:
646 build = self.meta["gwaslab"]["genome_build"]
--> 648 plot = mqqplot(self.data,
649 snpid=snpid,
650 chrom=chrom,
651 pos=pos,
652 p=p,
653 eaf=eaf,
654 build = build,
655 **kwargs)
657 return plot

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gwaslab/viz_plot_mqqplot.py:896, in mqqplot(insumstats, chrom, pos, p, snpid, eaf, ea, nea, check, chr_dict, xtick_chr_dict, vcf_path, vcf_chr_dict, gtf_path, gtf_chr_dict, gtf_gene_name, rr_path, rr_header_dict, rr_chr_dict, rr_lim, rr_ylabel, mlog10p, scaled, mode, scatter_args, scatterargs, qq_scatter_args, qqscatterargs, qq_line_color, region, region_title, region_title_args, region_ref, region_ref2, region_ref_second, region_step, region_grid, region_grid_line, region_lead_grid, region_lead_grid_line, region_hspace, region_ld_threshold, region_ld_legend, region_ld_colors, region_ld_colors_m, region_recombination, region_protein_coding, region_flank_factor, region_anno_bbox_args, region_marker_shapes, region_legend_marker, cbar_title, cbar_fontsize, cbar_font_family, track_n, track_n_offset, track_fontsize_ratio, track_exon_ratio, track_text_offset, track_font_family, taf, tabix, mqqratio, bwindowsizekb, density_color, density_range, density_trange, density_threshold, density_tpalette, density_palette, windowsizekb, anno, anno_set, anno_alias, anno_d, anno_args, anno_style, anno_fixed_arm_length, anno_source, anno_gtf_path, anno_adjust, anno_xshift, anno_max_iter, arrow_kwargs, arm_offset, arm_scale, anno_height, arm_scale_d, cut, skip, ystep, ylabels, ytick3, cutfactor, cut_line_color, cut_log, jagged, jagged_len, jagged_wid, sig_line, sig_level, sig_level_plot, sig_level_lead, sig_line_color, suggestive_sig_line, suggestive_sig_level, suggestive_sig_line_color, additional_line, additional_line_color, sc_linewidth, highlight, highlight_chrpos, highlight_color, highlight_windowkb, highlight_anno_args, pinpoint, pinpoint_color, stratified, maf_bins, maf_bin_colors, gc, include_chrXYMT, ylim, xpad, xpadl, xpadr, xtight, chrpad, drop_chr_start, title, mtitle, qtitle, ylabel, xlabel, title_pad, title_fontsize, fontsize, font_family, anno_fontsize, figargs, fig_args, figax, colors, marker_size, use_rank, 
verbose, repel_force, build, _posdiccul, dpi, save, save_args, saveargs, _invert, _chrom_df_for_i, _if_quick_qc, _get_region_lead, expected_min_mlog10p, log)
894 log.write(" -Found "+str(len(to_annotate))+" significant variants with a sliding window size of "+str(windowsizekb)+" kb...",verbose=verbose)
895 if (to_annotate.empty is not True) and anno=="GENENAME":
--> 896 to_annotate = annogene(to_annotate,
897 id=snpid,
898 chrom=chrom,
899 pos=pos,
900 log=log,
901 build=build,
902 source=anno_source,
903 gtf_path=anno_gtf_path,
904 verbose=verbose).rename(columns={"GENE":"Annotation"})
905 log.write("Finished extracting variants for annotation...",verbose=verbose)
907 # Configure X, Y axes #######################################################

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gwaslab/util_in_get_sig.py:308, in annogene(insumstats, id, chrom, pos, log, xymt, build, source, gtf_path, verbose)
303 data = Genome(
304 reference_name='GRCh38',
305 annotation_name='Ensembl',
306 gtf_path_or_url=gtf_path)
307 if path.isfile(gtf_db_path) is False:
--> 308 data.index()
309 output.loc[:,["LOCATION","GENE"]] = pd.DataFrame(
310 list(output.apply(lambda x:closest_gene(x,data=data,chrom=chrom,pos=pos,source=source), axis=1)),
311 index=output.index).values
313 if source == "refseq":

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pyensembl/genome.py:273, in Genome.index(self, overwrite)
267 """
268 Assuming that all necessary data for this Genome has been downloaded,
269 generate the GTF database and save efficient representation of
270 FASTA sequence files.
271 """
272 if self.requires_gtf:
--> 273 self.db.connect_or_create(overwrite=overwrite)
274 if self.requires_transcript_fasta:
275 self.transcript_sequences.index(overwrite=overwrite)

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pyensembl/database.py:290, in Database.connect_or_create(self, overwrite)
288 return connection
289 else:
--> 290 return self.create(overwrite=overwrite)

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pyensembl/database.py:212, in Database.create(self, overwrite)
209 logger.info("Creating database: %s", self.local_db_path)
210 datacache.ensure_dir(self.cache_directory_path)
--> 212 df = self._load_gtf_as_dataframe(
213 usecols=self.restrict_gtf_columns,
214 features=self.restrict_gtf_features)
215 all_index_groups = self._all_possible_indices(df.columns)
217 if self.restrict_gtf_features:

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/pyensembl/database.py:604, in Database._load_gtf_as_dataframe(self, usecols, features)
600 """
601 Parse this genome source's GTF file and load it as a Pandas DataFrame
602 """
603 logger.info("Reading GTF from %s", self.gtf_path)
--> 604 df = read_gtf(
605 self.gtf_path,
606 column_converters={
607 "seqname": normalize_chromosome,
608 "strand": normalize_strand,
609 },
610 infer_biotype_column=True,
611 usecols=usecols,
612 features=features)
614 column_names = set(df.keys())
615 expect_gene_feature = features is None or "gene" in features

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gtfparse/read_gtf.py:205, in read_gtf(filepath_or_buffer, expand_attribute_column, infer_biotype_column, column_converters, usecols, features, chunksize)
202 raise ValueError("GTF file does not exist: %s" % filepath_or_buffer)
204 if expand_attribute_column:
--> 205 result_df = parse_gtf_and_expand_attributes(
206 filepath_or_buffer,
207 chunksize=chunksize,
208 restrict_attribute_columns=usecols,
209 features=features)
210 else:
211 result_df = parse_gtf(result_df, features=features)

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gtfparse/read_gtf.py:148, in parse_gtf_and_expand_attributes(filepath_or_buffer, chunksize, restrict_attribute_columns, features)
124 def parse_gtf_and_expand_attributes(
125 filepath_or_buffer,
126 chunksize=1024 * 1024,
127 restrict_attribute_columns=None,
128 features=None):
129 """
130 Parse lines into column->values dictionary and then expand
131 the 'attribute' column into multiple columns. This expansion happens
(...)
146 Ignore entries which don't correspond to one of the supplied features
147 """
--> 148 result = parse_gtf(
149 filepath_or_buffer,
150 chunksize=chunksize,
151 features=features)
152 attribute_values = result["attribute"]
153 del result["attribute"]

File ~/miniforge3/envs/GWAS/lib/python3.9/site-packages/gtfparse/read_gtf.py:119, in parse_gtf(filepath_or_buffer, chunksize, features, intern_columns, fix_quotes_columns)
117 dataframes.append(df)
118 except Exception as e:
--> 119 raise ParsingError(str(e))
120 df = pd.concat(dataframes)
121 return df

ParsingError: CRC check failed 0x837dc8f8 != 0xbaf6464a

Cloufield · 2025-01-24T06:51:23Z

Hi,
based on the error messages "BadGzipFile Traceback (most recent call last)" and "ParsingError: CRC check failed", it seems that the downloaded gzipped GTF file might be corrupted. Could you delete that file and rerun the function? The default directory for reference files is ~/.gwaslab.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

error at annotation #132

error at annotation #132

liukeweiaway commented Jan 23, 2025

Cloufield commented Jan 24, 2025

error at annotation #132

error at annotation #132

Comments

liukeweiaway commented Jan 23, 2025

Cloufield commented Jan 24, 2025