Coder Social home page Coder Social logo

snpdensity's Introduction

Plot Guanine and Cytosine Content per FASTA read

library(seqinr)

Seqs <- read.fasta("GCF0014339351IRGSPgenomic.fasta")
length(Seqs)
[1] 58

There are 58 FASTA reads and we will look at the second and third read to determine and plot Guanine and Cytosine Content

SeqsSeq <- Seqs[[2]]
SeqsSeq2 <- Seqs[[3]]

slidingwindowplot <- function(windowsize, inputseq)
{
  starts <- seq(1, length(inputseq)-windowsize, by = windowsize)
  n <- length(starts)
  chunkGCs <- numeric(n)
  for (i in 1:n) {
    chunk <- inputseq[starts[i]:(starts[i]+windowsize-1)]
    chunkGC <- GC(chunk)
    chunkGCs[i] <- chunkGC
  }
  plot(starts,chunkGCs,type="b",col="black",xlab="Nucleotide start position",ylab="GC content")
  abline(a=mean(chunkGCs),b =0,col="red",lwd=3)
}

par(mfrow=c(2,1))
slidingwindowplot(windowsize = 200000, SeqsSeq)
slidingwindowplot(windowsize = 200000, SeqsSeq2)

Screenshot from 2022-04-05 12-08-21

Quality Control and Number of SNPs called in a sliding window

#Chinese Rice 

setwd("/home/michael/Desktop/GenomicVis")
file <- "freebayes~bwa~IRGSP-1.0~all-mutants-minus-S14~QUAL1000-S15-HOMREF.vcf"
Chrom <- c("NC_029256.1","NC_029257.1","NC_029258.1","NC_029259.1","NC_029260.1","NC_029261.1","NC_029262.1","NC_029263.1","NC_029264.1","NC_029265.1","NC_029266.1","NC_029267.1")
ChromQual(file =file,chromlist = Chrom, windowSize = 1e+05, HighLimQuality = 8000, scalar = 1,ncol=12,binwidth1 = 100,binwidth2 = 1,p1=TRUE,p2=TRUE,p3=TRUE,p4=TRUE,p5=TRUE )

Screenshot from 2022-04-05 12-21-10

SNP Quality Scores with Loess smoothing curve

Screenshot from 2022-04-05 12-20-43

Histogram Number of SNPs Called in sliding window

Screenshot from 2022-04-05 12-20-30

Histogram of Quality Scores

Screenshot from 2022-04-05 12-20-19

Run a Correlation Analysis on some Key Variables

setwd("/home/michael/Desktop/GenomicVis")
file <- "freebayes~bwa~IRGSP-1.0~all-mutants-minus-S14~QUAL1000-S15-HOMREF.vcf"
Chrom <- c("NC_029256.1","NC_029257.1","NC_029258.1","NC_029259.1","NC_029260.1","NC_029261.1","NC_029262.1","NC_029263.1","NC_029264.1","NC_029265.1","NC029266.1","NC_029267.1")
Correlation(file=file,chromlist = Chrom,p1=TRUE,p2=TRUE,p3=TRUE,p4=TRUE,p5=TRUE)

Screenshot from 2022-04-06 09-41-18

Plotting total Depths for each sample

library(vcfR)
#Please refer to powerpoint for more
https://trapa.cz/sites/default/files/r_mol_data_phylogen_2.pdf


setwd("/home/michael/Desktop/GenomicVis")
rice.vcf <- read.vcfR(file = "freebayes~bwa~IRGSP-1.0~all-mutants-minus-S14~QUAL1000-S15-HOMREF.vcf")
strwrap(x=grep(pattern="ID=DP,", x =rice.vcf@meta, value = TRUE))
rice.vcf.dp <- extract.gt(x=rice.vcf,element = "DP", as.numeric = TRUE)
dim(rice.vcf.dp)
head(rice.vcf.dp)
boxplot(x=rice.vcf.dp, col="red",ylab="Depth of coverage",las=3,pch=19)
title("DP per specimen")

barplot(apply(X=rice.vcf.dp,MARGIN=2,FUN=mean, na.rm=TRUE),las=3)
title("Mean DP per specimen")
heatmap.bp(x=rice.vcf.dp[1:3000,1:14],col.ramp = rainbow(n=14,start=0.1))

# Heat map for first 30 variants called
heatmap.bp(x=rice.vcf.dp[1:30,1:14],col.ramp = rainbow(n=14,start=0.1))

Heat map for the first 30 Variants for each sample

Screenshot from 2022-04-06 11-18-05

Heat map for most of the variants called

Screenshot from 2022-04-06 11-17-46

Average Total Depth Coverage Per Sample

Screenshot from 2022-04-06 11-17-25

Depth Calls per sample

Screenshot from 2022-04-06 11-17-05

Inspect the Data Structure Tree

Screenshot from 2022-03-21 16-08-51

VCF File Header UpStream Filtering

Rice_UpstreamFiltering

VCF File Header with Samples Listed

RiceHeader

Separate Samples with Control Sample 15

Rice1_15

RUN the R Script from Snakemake workflow

library(magick)
library(vcfR)
library(rMVP)
#Set the working directory where the Variant Calling File is located
message("Setting Working Directory")
setwd("/home/michael/Desktop/SNPVCFPRACTICE")
message("Calling MVP Data function on all decompressed variant calls")

#Call the MVP Data function
message("Calling the MVP function")


sample<-"S1"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S1~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data S1")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data S1")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S2"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S2~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data S2")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data S2")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)





sample<-"S3"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S3~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 3")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 3")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S4"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S4~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 4")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 4")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)





sample<-"S5"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S5~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 5")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 5")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S6"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S6~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 6")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 6")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)





sample<-"S7"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S7~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 7")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 7")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S8"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S8~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 8")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 8")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)




sample<-"S9"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S9~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 9")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 9")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)





sample<-"S10"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S10~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 10")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 10")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S11"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S11~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 11")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 11")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)





sample<-"S12"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S12~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 12")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 12")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S13"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S13~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 13")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 13")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)


sample<-"S14"
pathtosample <- "VCF/freebayes~bwa~IRGSP-1.0~S14~HOM-VAR.vcf"
out<- paste0("mvp.",sample,".vcf")
memo<-paste0(sample)
dffile<-paste0("mvp.",sample,".vcf.geno.map")

message("Making MVP data 14")
MVP.Data(fileVCF=pathtosample,
           #filePhe="Phenotype.txt",
           fileKin=FALSE,
           filePC=FALSE,
           out=out
)
message("Reading MVP Data 14")
df <- read.table(file = dffile, header=TRUE)
message("Making SNP Density Plots")
MVP.Report.Density(df[,c(1:3)], bin.size = 10000000, col = c("blue", "yellow", "red"), memo = memo, file.type = "jpg", dpi=300)

The Snakemake file

Screenshot from 2022-03-21 16-13-01

In a SNAKEMAKE Workflow pipeline

Screenshot from 2022-03-21 16-13-58

The Directed Acrylic Graph

Screenshot from 2022-03-21 16-14-53

The final output of the workflow from Sample 1 to Sample 14 in descending order

Total.pdf

SNP Denisity Plots for Samples 1 - 14 against Control Sample 15

Notice Chromosome 6 has magnitude of significance in TOTAL SNPs called

SNP1 SNP2 SNP3 SNP4 SNP5 SNP6 SNP7 SNP8

I removed Homozygous Reference and then performed the command

Nohomozygous

Then I filtered for Homozygous Alternates with this command

command

Now, I run this command to find how many varaints they all share in common

64

Finally, I run this R Script to better classify what these varaints are in the genome

setwd("/home/michael/Desktop/GenomicVis")


library(vcfR)
library(VariantAnnotation)
library(GenomicFeatures)

vcf <- readVcf(file = "HomozygousRecessive.vcf")

rd <- rowRanges(vcf)

# convert annotations to TxDb object
GFFTXB<-makeTxDbFromGFF(file="GCF_001433935.1_IRGSP-1.0_genomic.gff")



#Locate Variants
loc <- locateVariants(rd, GFFTXB, AllVariants())



#How many variants were called in coding locations
length(loc)

#92

# Locate Coding Variants
loc2 <- locateVariants(rd, GFFTXB, CodingVariants())
*genetic variants that lead to changes in the sequence of amino-acid residues in the encoded protein*
  
  
length(loc2)

#0

loc3 <- locateVariants(rd, GFFTXB, IntergenicVariants())
*An intergenic region (IGR) is a stretch of DNA sequences located between genes.[1] Intergenic regions are a subset of noncoding DNA. 
*Occasionally some intergenic DNA acts to control genes nearby, but most of it has no currently known function. It is one of the DNA sequences sometimes referred to as junk DNA, 
length(loc3)

#58


*In genetics, a promoter is a sequence of DNA to which proteins bind to initiate transcription of a single RNA transcript from the DNA downstream of the promoter.

loc4 <- locateVariants(rd, GFFTXB, PromoterVariants())

length(loc4)

#8

loc5 <- locateVariants(rd, GFFTXB, FiveUTRVariants())

* is the region of a messenger RNA (mRNA) that is directly upstream from the initiation codon. 

length(loc5)

#1

loc6 <- locateVariants(rd, GFFTXB, ThreeUTRVariants())

length(loc6)

#0

loc7 <- locateVariants(rd, GFFTXB, SpliceSiteVariants())

length(loc7)

#0

Ranges <- loc@ranges
class(Ranges)
Ranges<-as.data.frame(Ranges)
class(Ranges)
Location <- loc@elementMetadata$LOCATION
class(Location)
Location <- as.data.frame(Location)
class(Location)
GENEID <- loc@elementMetadata@listData$GENEID
class(GENEID)
GENEID <- as.data.frame(GENEID)

df99 <- cbind(Ranges,Location,GENEID)
df99 <- as.data.frame(df99)
class(df99)
str(df99)


write.table(df99, file = "Varaints.csv",sep=",")

Screenshot from 2022-05-09 12-48-07

Sort by whether or not the SNP or INDEL is associated with a gene

Screenshot from 2022-05-09 12-49-40

We can infer from this analysis that chromosomes 1,5, and 12, while they have SNPs, do not all share a common one.

Chrosome 6 has the most shared Insertions/Deletions and SNPs in either the intron region associated with gene LOC9270017 or in a promoter region associated with gen LOC107276875 and LOC107281325 and LOC107281374 and LOC 4341167!

Now I want to filter the first VCF file eliminating Sample 15 and calling only Biallelic SNPs which are homozygous recessive.

This is done through bcftools view and bcftools filter

After filtering there are 50 Pure SNPs remaining.

Screenshot from 2022-05-09 13-57-58

Screenshot from 2022-05-09 13-59-56

So I remap them back to RMVP visual aid

snpdensity's People

Contributors

pbglmichaelhall avatar

Watchers

 avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    ๐Ÿ–– Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. ๐Ÿ“Š๐Ÿ“ˆ๐ŸŽ‰

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google โค๏ธ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.