diff --git a/CGATPipelines/pipeline_docs/pipeline_readqc/contents.rst b/CGATPipelines/pipeline_docs/pipeline_readqc/contents.rst index 0cc63fb3..bb66e30a 100644 --- a/CGATPipelines/pipeline_docs/pipeline_readqc/contents.rst +++ b/CGATPipelines/pipeline_docs/pipeline_readqc/contents.rst @@ -21,7 +21,10 @@ for the ReadQC Pipeline is below: pipeline/SequenceDuplicationLevels.rst pipeline/Filtering.rst pipeline/SequenceLength.rst - pipeline/BiasAnalysis.rst + pipeline/BiasAnalysisLinear.rst + pipeline/BiasAnalysisFirst.rst + pipeline/BiasAnalysisSecond.rst + pipeline/BiasAnalysisThird.rst python/Trackers.rst .. automodule:: pipeline_readqc diff --git a/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisFirst.rst b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisFirst.rst new file mode 100644 index 00000000..e8d971c5 --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisFirst.rst @@ -0,0 +1,684 @@ +===================================================================== +Bias analysis results - Split by first identifier +===================================================================== + +This page presents the analysis of potential biasing factors using +linear regression. The plot aesthetics are split by the first +identifier, e.g. tissue. + +Genes/transcripts are binned according to their value for each +potential biasing factor (e.g. GC content), with each bin containing an +equal number of genes/transcripts. The mean expression for the +genes/transcripts for each sample is then calculated for each +bin. This mean expression is plotted below, along with a linear +regression for each sample. + + +.. AH: Removed toframe transformations for CGATReport. +GC content plots +================ + +.. 
report:: Status.GCContentSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC_Content, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC Content (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned GC content for each sample. Linear regression. + +.. report:: Status.GCContentSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC_Content, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC Content (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned GC content for each sample. Local + regression. + + +Length plots +============ + +.. report:: Status.LengthSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=length, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('length (Log 2 bp)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned length for each sample. Linear regression. + +.. 
report:: Status.LengthSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=length, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('length (Log 2 bp)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned length for each sample. Local + regression. + + +AA dinucleotide plots +===================== + +.. report:: Status.AASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AA dinucleotides for each + sample. Linear regression. + +.. report:: Status.AASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AA dinucleotides for each + sample. Local regression. + + +AT dinucleotide plots +===================== + +.. 
report:: Status.ATSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AT dinucleotides for each + sample. Linear regression. + +.. report:: Status.ATSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AT dinucleotides for each + sample. Local regression. + + +AC dinucleotide plots +===================== + +.. report:: Status.ACSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AC dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.ACSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AC dinucleotides for each + sample. Local regression. + +AG dinucleotide plots +===================== + +.. report:: Status.AGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AG dinucleotides for each + sample. Linear regression. + +.. report:: Status.AGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AG dinucleotides for each + sample. Local regression. + +TA dinucleotide plots +===================== + +.. 
report:: Status.TASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TA dinucleotides for each + sample. Linear regression. + +.. report:: Status.TASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TA dinucleotides for each + sample. Local regression. + +TT dinucleotide plots +===================== + +.. report:: Status.TTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TT dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.TTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TT dinucleotides for each + sample. Local regression. + +TC dinucleotide plots +===================== + +.. report:: Status.TCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TC dinucleotides for each + sample. Linear regression. + +.. report:: Status.TCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TC dinucleotides for each + sample. Local regression. + +TG dinucleotide plots +===================== + +.. 
report:: Status.TGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TG dinucleotides for each + sample. Linear regression. + +.. report:: Status.TGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TG dinucleotides for each + sample. Local regression. + +CA dinucleotide plots +===================== + +.. report:: Status.CASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CA dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.CASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CA dinucleotides for each + sample. Local regression. + +CT dinucleotide plots +===================== + +.. report:: Status.CTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CT dinucleotides for each + sample. Linear regression. + +.. report:: Status.CTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CT dinucleotides for each + sample. Local regression. + +CC dinucleotide plots +===================== + +.. 
report:: Status.CCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CC dinucleotides for each + sample. Linear regression. + +.. report:: Status.CCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CC dinucleotides for each + sample. Local regression. + +CG dinucleotide plots +===================== + +.. report:: Status.CGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CG dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.CGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CG dinucleotides for each + sample. Local regression. + +GA dinucleotide plots +===================== + +.. report:: Status.GASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GA dinucleotides for each + sample. Linear regression. + +.. report:: Status.GASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GA, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GA dinucleotides for each + sample. Local regression. + +GT dinucleotide plots +===================== + +.. 
report:: Status.GTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GT dinucleotides for each + sample. Linear regression. + +.. report:: Status.GTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GT, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GT dinucleotides for each + sample. Local regression. + +GC dinucleotide plots +===================== + +.. report:: Status.GCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GC dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.GCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GC dinucleotides for each + sample. Local regression. + +GG dinucleotide plots +===================== + +.. report:: Status.GGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GG dinucleotides for each + sample. Linear regression. + +.. report:: Status.GGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GG, colour=id_1)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_1),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='First Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GG dinucleotides for each + sample. Local regression. 
diff --git a/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisLinear.rst b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisLinear.rst new file mode 100644 index 00000000..d55af0cc --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisLinear.rst @@ -0,0 +1,54 @@ +=================================================== +Bias analysis results - linear regression - Summary +=================================================== + +This page presents an overview of the analysis of potential biasing +factors using linear regression. The plot aesthetics are split by the +first identifier, e.g. tissue. + +Genes/transcripts are binned according to their value for each +potential biasing factor (e.g. GC content), with each bin containing an +equal number of genes/transcripts. The mean expression for the +genes/transcripts for each sample is then calculated for each bin. The +relationship between the potential biasing factor and expression level +is explored by computing the Spearman rank correlation and linear +regression. The gradient of the linear regression and rho value of the +correlation are shown below. + + +.. AH: Removed toframe transformations for CGATReport. + + +Summary plots +========================= + +.. report:: ReadqcReport.CorrelationSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=as.factor(sample), + colour=as.factor(variable), group=as.factor(variable))+geom_line()+ + scale_colour_discrete(name=guide_legend(title='biasfactor'))+ + xlab('')+ylab('Correlation')+ + theme(axis.text.x=element_text(size=15,hjust=1,angle=90), + axis.text.y=element_text(size=15),title=element_text(size=15), + legend.text=element_text(size=15)) + + + Correlation between gene expression and potential biasing factors + across all samples. + +.. 
report:: ReadqcReport.GradientSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=as.factor(sample), + colour=as.factor(variable), group=as.factor(variable))+ + geom_line()+ + scale_colour_discrete(name = guide_legend(title='biasfactor'))+ + xlab('')+ + ylab('Gradient')+ + theme(axis.text.x=element_text(size=15,angle=90,hjust=1), + axis.text.y=element_text(size=15),title=element_text(size=15), + legend.text=element_text(size=15)) + + Gradient of linear regression between gene expression and potential + biasing factors across all samples. + + diff --git a/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisSecond.rst b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisSecond.rst new file mode 100644 index 00000000..0626c015 --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisSecond.rst @@ -0,0 +1,684 @@ +===================================================================== +Bias analysis results - Split by second identifier +===================================================================== + +This page presents the analysis of potential biasing factors using +linear regression. The plot aesthetics are split by the second +identifier, e.g. treatment. + +Genes/transcripts are binned according to their value for each +potential biasing factor (e.g. GC content), with each bin containing an +equal number of genes/transcripts. The mean expression for the +genes/transcripts for each sample is then calculated for each +bin. This mean expression is plotted below, along with a linear +regression for each sample. + + +GC content plots +================ + +.. 
report:: Status.GCContentSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC_Content, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC Content (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned GC content for each sample. Linear regression. + +.. report:: Status.GCContentSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC_Content, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC Content (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned GC content for each sample. Local + regression. + + +Length plots +============ + +.. report:: Status.LengthSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=length, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('length (Log 2 bp)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned length for each sample. Linear regression. + +.. 
report:: Status.LengthSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=length, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('length (Log 2 bp)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned length for each sample. Local + regression. + + +AA dinucleotide plots +===================== + +.. report:: Status.AASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AA dinucleotides for each + sample. Linear regression. + +.. report:: Status.AASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AA dinucleotides for each + sample. Local regression. + + +AT dinucleotide plots +===================== + +.. 
report:: Status.ATSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AT dinucleotides for each + sample. Linear regression. + +.. report:: Status.ATSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AT dinucleotides for each + sample. Local regression. + + +AC dinucleotide plots +===================== + +.. report:: Status.ACSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AC dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.ACSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AC dinucleotides for each + sample. Local regression. + +AG dinucleotide plots +===================== + +.. report:: Status.AGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AG dinucleotides for each + sample. Linear regression. + +.. report:: Status.AGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AG dinucleotides for each + sample. Local regression. + +TA dinucleotide plots +===================== + +.. 
report:: Status.TASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TA dinucleotides for each + sample. Linear regression. + +.. report:: Status.TASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TA dinucleotides for each + sample. Local regression. + +TT dinucleotide plots +===================== + +.. report:: Status.TTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TT dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.TTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TT dinucleotides for each + sample. Local regression. + +TC dinucleotide plots +===================== + +.. report:: Status.TCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TC dinucleotides for each + sample. Linear regression. + +.. report:: Status.TCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TC dinucleotides for each + sample. Local regression. + +TG dinucleotide plots +===================== + +.. 
report:: Status.TGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TG dinucleotides for each + sample. Linear regression. + +.. report:: Status.TGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TG dinucleotides for each + sample. Local regression. + +CA dinucleotide plots +===================== + +.. report:: Status.CASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CA dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.CASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CA dinucleotides for each + sample. Local regression. + +CT dinucleotide plots +===================== + +.. report:: Status.CTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CT dinucleotides for each + sample. Linear regression. + +.. report:: Status.CTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CT dinucleotides for each + sample. Local regression. + +CC dinucleotide plots +===================== + +.. 
report:: Status.CCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CC dinucleotides for each + sample. Linear regression. + +.. report:: Status.CCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CC dinucleotides for each + sample. Local regression. + +CG dinucleotide plots +===================== + +.. report:: Status.CGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CG dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.CGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage CG dinucleotides for each + sample. Local regression. + +GA dinucleotide plots +===================== + +.. report:: Status.GASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GA dinucleotides for each + sample. Linear regression. + +.. report:: Status.GASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GA, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GA dinucleotides for each + sample. Local regression. + +GT dinucleotide plots +===================== + +.. 
report:: Status.GTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GT dinucleotides for each + sample. Linear regression. + +.. report:: Status.GTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GT, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GT dinucleotides for each + sample. Local regression. + +GC dinucleotide plots +===================== + +.. report:: Status.GCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GC dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.GCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GC dinucleotides for each + sample. Local regression. + +GG dinucleotide plots +===================== + +.. report:: Status.GGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GG dinucleotides for each + sample. Linear regression. + +.. report:: Status.GGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GG, colour=id_2)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_2),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Second Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage GG dinucleotides for each + sample. Local regression. 
+ diff --git a/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisThird.rst b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisThird.rst new file mode 100644 index 00000000..7d86c0d5 --- /dev/null +++ b/CGATPipelines/pipeline_docs/pipeline_readqc/pipeline/BiasAnalysisThird.rst @@ -0,0 +1,683 @@ +===================================================================== +Bias analysis results - Split by third identifier +===================================================================== + +This page presents the analysis of potential biasing factors using +linear regression. The plot aesthetics are split by the third +identifier, e.g. tissue. + +Genes/transcripts are binned according to their value for each +potential biasing factor (e.g. GC content), with each bin containing an +equal number of genes/transcripts. The mean expression for the +genes/transcripts for each sample is then calculated for each +bin. This mean expression is plotted below, along with a linear +regression for each sample. + + +GC content plots +================ + +.. report:: Status.GCContentSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC_Content, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC Content (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned GC content for each sample. Linear regression. + +.. 
report:: Status.GCContentSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC_Content, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC Content (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned GC content for each sample. Local + regression. + + +Length plots +============ + +.. report:: Status.LengthSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=length, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('length (Log 2 bp)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned length for each sample. Linear regression. + +.. report:: Status.LengthSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=length, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('length (Log 2 bp)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned length for each sample. Local + regression. + + +AA dinucleotide plots +===================== + +.. 
report:: Status.AASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AA dinucleotides for each + sample. Linear regression. + +.. report:: Status.AASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AA dinucleotides for each + sample. Local regression. + + +AT dinucleotide plots +===================== + +.. report:: Status.ATSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AT dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.ATSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AT dinucleotides for each + sample. Local regression. + + +AC dinucleotide plots +===================== + +.. report:: Status.ACSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AC dinucleotides for each + sample. Linear regression. + +.. report:: Status.ACSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AC dinucleotides for each + sample. Local regression. + +AG dinucleotide plots +===================== + +.. 
report:: Status.AGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AG dinucleotides for each + sample. Linear regression. + +.. report:: Status.AGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=AG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('AG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage AG dinucleotides for each + sample. Local regression. + +TA dinucleotide plots +===================== + +.. report:: Status.TASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TA dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.TASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TA dinucleotides for each + sample. Local regression. + +TT dinucleotide plots +===================== + +.. report:: Status.TTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TT dinucleotides for each + sample. Linear regression. + +.. report:: Status.TTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TT dinucleotides for each + sample. Local regression. + +TC dinucleotide plots +===================== + +.. 
report:: Status.TCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TC dinucleotides for each + sample. Linear regression. + +.. report:: Status.TCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TC dinucleotides for each + sample. Local regression. + +TG dinucleotide plots +===================== + +.. report:: Status.TGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned percentage TG dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.TGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=TG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('TG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of TG dinucleotides for each + sample. Local regression. + +CA dinucleotide plots +===================== + +.. report:: Status.CASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CA dinucleotides for each + sample. Linear regression. + +.. report:: Status.CASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CA dinucleotides for each + sample. Local regression. + +CT dinucleotide plots +===================== + +.. 
report:: Status.CTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CT dinucleotides for each + sample. Linear regression. + +.. report:: Status.CTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CT dinucleotides for each + sample. Local regression. + +CC dinucleotide plots +===================== + +.. report:: Status.CCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CC dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.CCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CC dinucleotides for each + sample. Local regression. + +CG dinucleotide plots +===================== + +.. report:: Status.CGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CG dinucleotides for each + sample. Linear regression. + +.. report:: Status.CGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=CG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('CG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of CG dinucleotides for each + sample. Local regression. + +GA dinucleotide plots +===================== + +.. 
report:: Status.GASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GA dinucleotides for each + sample. Linear regression. + +.. report:: Status.GASummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GA, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GA (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GA dinucleotides for each + sample. Local regression. + +GT dinucleotide plots +===================== + +.. report:: Status.GTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GT dinucleotides for each + sample. Linear regression. + +.. 
report:: Status.GTSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GT, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GT (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GT dinucleotides for each + sample. Local regression. + +GC dinucleotide plots +===================== + +.. report:: Status.GCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GC dinucleotides for each + sample. Linear regression. + +.. report:: Status.GCSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GC, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GC (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GC dinucleotides for each + sample. Local regression. + +GG dinucleotide plots +===================== + +.. 
report:: Status.GGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=lm,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GG dinucleotides for each + sample. Linear regression. + +.. report:: Status.GGSummary + :render: r-ggplot + :statement: aes(y=as.numeric(value), x=GG, colour=id_3)+ + geom_point()+ + stat_smooth(aes(group=variable,colour=id_3),method=loess,se=F)+ + scale_colour_discrete(name=guide_legend(title='Third Identifier'))+ + scale_y_continuous(limits=c(0,1))+ + xlab('GG (Fraction)')+ + ylab('Normalised Expression(Nominal scale)')+ + theme(axis.text.x=element_text(size=10,angle=90), + axis.text.y=element_text(size=15), + title=element_text(size=15), + legend.text=element_text(size=15)) + + Mean expression across binned fraction of GG dinucleotides for each + sample. Local regression. 
diff --git a/CGATPipelines/pipeline_docs/pipeline_readqc/trackers/ReadqcReport.py b/CGATPipelines/pipeline_docs/pipeline_readqc/trackers/ReadqcReport.py index 75b4a4f5..b2bb4fce 100644 --- a/CGATPipelines/pipeline_docs/pipeline_readqc/trackers/ReadqcReport.py +++ b/CGATPipelines/pipeline_docs/pipeline_readqc/trackers/ReadqcReport.py @@ -4,7 +4,8 @@ import types import itertools import glob - +import pandas as pd +import numpy as np from CGATReport.Tracker import * from CGATReport.Utils import PARAMS as P from collections import OrderedDict as odict @@ -165,101 +166,135 @@ class ProcessingSummary(ReadqcTracker, SingleTableTrackerRows): table = "processing_summary" -class CorrelationSummary(ReadqcTracker, SingleTableTrackerRows): +class CorrelationSummary(ReadqcTracker): table = "binned_means_correlation" - fields = ("sample",) + def getTracks(self, subset=None): + return ("all") -class GradientSummary(ReadqcTracker, SingleTableTrackerRows): + def __call__(self, track, slice=None): + statement = ("SELECT * FROM %(table)s") + # fetch data + df = pd.DataFrame.from_dict(self.getAll(statement)) + df['sample'] = map(lambda x: re.sub("_quant.sf", "", x), df['sample']) + df = pd.melt(df, id_vars="sample") + df2 = pd.DataFrame(map(lambda x: x.split("-"), df['sample'])) + df2.columns = ["id_"+str(x) for x in range(1, len(df2.columns)+1)] + merged = pd.concat([df, df2], axis=1) + return merged + + +class GradientSummary(CorrelationSummary): table = "binned_means_gradients" - fields = ("sample",) -class GCContentSummary(ReadqcTracker, SingleTableTrackerRows): +class BiasFactorPlot(ReadqcTracker): + table = "" + factor = "" + + def getTracks(self, subset=None): + return ("all") + + def __call__(self, track, slice=None): + statement = ("SELECT * FROM %(table)s") + # fetch data + df = pd.DataFrame.from_dict(self.getAll(statement)) + df = pd.melt(df, id_vars=self.factor) + df['variable'] = map(lambda x: re.sub("_quant.sf", "", x), + df['variable']) + df['value'] = ((df['value'] - 
min(df['value'])) / + (max(df['value'])-min(df['value']))) + df2 = pd.DataFrame(map(lambda x: x.split("_"), df['variable'])) + df2.columns = ["id_"+str(x) for x in range(1, len(df2.columns)+1)] + merged = pd.concat([df, df2], axis=1) + return merged + + +class GCContentSummary(BiasFactorPlot): table = "means_binned_GC_Content" - fields = ("GC_Content",) + factor = "GC_Content" -class LengthSummary(ReadqcTracker, SingleTableTrackerRows): +class LengthSummary(BiasFactorPlot): table = "means_binned_length" - fields = ("length",) + factor = "length" -class AASummary(ReadqcTracker, SingleTableTrackerRows): +class AASummary(BiasFactorPlot): table = "means_binned_AA" - fields = ("AA",) + factor = "AA" -class ATSummary(ReadqcTracker, SingleTableTrackerRows): +class ATSummary(BiasFactorPlot): table = "means_binned_AT" - fields = ("AT",) + factor = "AT" -class ACSummary(ReadqcTracker, SingleTableTrackerRows): +class ACSummary(BiasFactorPlot): table = "means_binned_AC" - fields = ("AC",) + factor = "AC" -class AGSummary(ReadqcTracker, SingleTableTrackerRows): +class AGSummary(BiasFactorPlot): table = "means_binned_AG" - fields = ("AG",) + factor = "AG" -class TASummary(ReadqcTracker, SingleTableTrackerRows): +class TASummary(BiasFactorPlot): table = "means_binned_TA" - fields = ("TA",) + factor = "TA" -class TTSummary(ReadqcTracker, SingleTableTrackerRows): +class TTSummary(BiasFactorPlot): table = "means_binned_TT" - fields = ("TT",) + factor = "TT" -class TCSummary(ReadqcTracker, SingleTableTrackerRows): +class TCSummary(BiasFactorPlot): table = "means_binned_TC" - fields = ("TC",) + factor = "TC" -class TGSummary(ReadqcTracker, SingleTableTrackerRows): +class TGSummary(BiasFactorPlot): table = "means_binned_TG" - fields = ("TG",) + factor = "TG" -class CASummary(ReadqcTracker, SingleTableTrackerRows): +class CASummary(BiasFactorPlot): table = "means_binned_CA" - fields = ("CA",) + factor = "CA" -class CTSummary(ReadqcTracker, SingleTableTrackerRows): +class 
CTSummary(BiasFactorPlot): table = "means_binned_CT" - fields = ("CT",) + factor = "CT" -class CCSummary(ReadqcTracker, SingleTableTrackerRows): +class CCSummary(BiasFactorPlot): table = "means_binned_CC" - fields = ("CC",) + factor = "CC" -class CGSummary(ReadqcTracker, SingleTableTrackerRows): +class CGSummary(BiasFactorPlot): table = "means_binned_CG" - fields = ("CG",) + factor = "CG" -class GASummary(ReadqcTracker, SingleTableTrackerRows): +class GASummary(BiasFactorPlot): table = "means_binned_GA" - fields = ("GA",) + factor = "GA" -class GTSummary(ReadqcTracker, SingleTableTrackerRows): +class GTSummary(BiasFactorPlot): table = "means_binned_GT" - fields = ("GT",) + factor = "GT" -class GCSummary(ReadqcTracker, SingleTableTrackerRows): +class GCSummary(BiasFactorPlot): table = "means_binned_GC" - fields = ("GC",) + factor = "GC" -class GGSummary(ReadqcTracker, SingleTableTrackerRows): +class GGSummary(BiasFactorPlot): table = "means_binned_GG" - fields = ("GG",) + factor = "GG" diff --git a/CGATPipelines/pipeline_readqc.py b/CGATPipelines/pipeline_readqc.py index 51fff273..23c40d2a 100644 --- a/CGATPipelines/pipeline_readqc.py +++ b/CGATPipelines/pipeline_readqc.py @@ -141,6 +141,7 @@ import pandas from pandas import DataFrame from scipy.stats import linregress +import itertools as iter import CGAT.Experiment as E import CGAT.IOTools as IOTools @@ -405,7 +406,7 @@ def characteriseTranscripts(infile, outfile): statement = '''cat %(infile)s''' statement += ''' | python %(scriptsdir)s/fasta2table.py - --split-fasta-identifier --section=dn -v 0 + --split-fasta-identifier --section=na,dn,length -v 0 | gzip > %(outfile)s''' P.run() @@ -422,15 +423,32 @@ def summariseBias(infiles, outfiles): transcripts, expression = infiles out_correlation, out_gradient = outfiles - atr = pandas.read_csv(transcripts, sep='\t', compression="gzip") + atr = pandas.read_csv(transcripts, sep='\t', + compression="gzip", index_col="id") exp = pandas.read_csv(expression, sep='\t', 
compression="gzip") + atr = atr.rename(columns={'pGC': 'GC_Content'}) + + def percentage(x): + return float(x[0])/float(x[1]) + + for di in iter.product("ATCG", repeat=2): + di = di[0]+di[1] + temp_df = atr.loc[:, [di, "length"]] + atr[di] = temp_df.apply(percentage, axis=1) + + drop_cols = (["nAT", "nGC", "pAT", "pA", "pG", "pC", "pT", "nA", + "nG", "nC", "nT", "ncodons", + "mCountsOthers", "nUnk", "nN", "pN"]) + atr = atr.drop(drop_cols, axis=1) atr["length"] = numpy.log2(atr["length"]) log_exp = numpy.log2(exp.ix[:, 1:]+0.1) log_exp["id"] = exp[["Transcript"]] + log_exp = log_exp.set_index("id") - bias_factors = list(atr.columns[1:]) - samples = list(exp.columns[1:]) + bias_factors = list(atr.columns) + samples = list(exp.columns) + samples.remove("Transcript") merged = atr.merge(log_exp, left_index="id", right_index="id") @@ -442,23 +460,18 @@ def aggregate_by_factor(df, attribute, sample_names, bins, function): temp_dict = dict.fromkeys(sample_names, function) temp_dict[attribute] = function - means_df = merged.groupby(pandas.qcut(df.ix[:, attribute], bins)) means_df = means_df.agg(temp_dict).sort(axis=1) - - corr_matrix = means_df.corr(method='pearson') + corr_matrix = means_df.corr(method='spearman') corr_matrix = corr_matrix[corr_matrix.index != attribute] - factor_gradients = [] for sample in samples: factor_gradients.append(lin_reg_grad(y=means_df[sample], x=means_df[factor])) - return means_df, corr_matrix, factor_gradients corr_matrices = {} gradient_lists = {} - for factor in bias_factors: means_binned, corr_matrix, gradients = aggregate_by_factor( merged, factor, samples, PARAMS["bias_bin"], numpy.mean) @@ -466,7 +479,6 @@ def aggregate_by_factor(df, attribute, sample_names, bins, function): factor, ".tsv") means_binned.to_csv(outfile_means, sep="\t", index=False, float_format='%.4f') - corr_matrices[factor] = list(corr_matrix[factor]) gradient_lists[factor] = gradients