R
and Basic of plotting with R
Download and install the correct version of R
(and RStudio
if you’d like) from:
http://www.rstudio.com/products/rstudio/download/
Also, copy the following files from the server to your computer or home directory on Xanadu for loading into R
:
/home/FCAM/meds5420/data/ATF1/closest_peak_TSS/peaks_promoters.bed
/home/FCAM/meds5420/data/ATF1/reads_in_peaks/ATF1_summits_rep1_counts_2col.bed
/home/FCAM/meds5420/data/ATF1/reads_in_peaks/ATF1_summits_rep2_counts_2col.bed
Make a directory in your class folder on your machine (or on Xanadu), and move the file there.
R
and Basic of plotting with R
R
: subsetting tablesOpen an R
environment and set you working directory to folder where the peaks_promoters.bed
file is.
getwd() # no input or options needed
## [1] "/Users/guertinlab/Documents/class/2023_lec20"
setwd("/Users/guertinlab/MEDS5420")
getwd()
The peaks_promoters.bed
file contains information on which gene TSS is closest to ATF1 peaks that we called with MACS. Recall that we used bedtools closest
to find the distance between ATF1 summits and TSSs and report them relative to genes with consideration of strand.
Read this table into your R session with read.table
:
#read in table
closest <- read.table("peaks_promoters.bed")
#check structure
head(closest,3)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 chr1 778706 778707 ATF1_peak_1 82.7650 chr1 778747 778748 LINC01409 ENSG00000237491 + 41
## 2 chr1 1000890 1000891 ATF1_peak_2 41.9708 chr1 1001138 1001139 ISG15 ENSG00000187608 + 248
## 3 chr1 1471676 1471677 ATF1_peak_3 30.5627 chr1 1471765 1471766 ATAD3B ENSG00000160072 + 89
Let’s check how many chromosomes are present in this data:
#report unique items in a list or vector
unique(closest[,1])
## [1] "chr1" "chr10" "chr11" "chr11_KI270832v1_alt" "chr12" "chr13"
## [7] "chr14" "chr14_GL000225v1_random" "chr14_KI270846v1_alt" "chr15" "chr16" "chr17"
## [13] "chr17_KI270857v1_alt" "chr18" "chr18_GL383572v1_alt" "chr18_KI270863v1_alt" "chr18_KI270911v1_alt" "chr19"
## [19] "chr1_KI270763v1_alt" "chr2" "chr20" "chr21" "chr21_KI270872v1_alt" "chr22"
## [25] "chr22_KI270733v1_random" "chr3" "chr3_KI270779v1_alt" "chr3_KI270924v1_alt" "chr4" "chr5"
## [31] "chr5_GL339449v2_alt" "chr5_KI270791v1_alt" "chr5_KI270897v1_alt" "chr6" "chr6_GL000252v2_alt" "chr6_GL000254v2_alt"
## [37] "chr6_GL000255v2_alt" "chr6_KI270797v1_alt" "chr7" "chr7_KI270809v1_alt" "chr8" "chr8_KI270816v1_alt"
## [43] "chr9" "chr9_KI270718v1_random" "chrUn_GL000216v2" "chrUn_GL000220v1" "chrX" "chrX_KI270880v1_alt"
## [49] "chrX_KI270913v1_alt"
Looks like this is the whole dataset, but what if you only want the part from your chromosome. Here are two ways to do this:
#subset a table based on a logical match to a column
closest_chr17<- subset(closest, closest[,1]=='chr17')
head(closest_chr17, 3)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 203 chr17 782326 782327 ATF1_peak_202 66.2886 chr17 782353 782354 MRM3 ENSG00000171861 + 27
## 204 chr17 4143675 4143676 ATF1_peak_203 14.7257 chr17 4143168 4143169 CYB5D2 ENSG00000167740 + -507
## 205 chr17 4704157 4704158 ATF1_peak_204 30.8864 chr17 4704336 4704337 PELP1 ENSG00000141456 - 179
#subset a table based on a logical match to a column
closest_chr17<- closest[closest[,1]=='chr17',]
#notice the syntax is based on table indexing
head(closest_chr17, 3)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 203 chr17 782326 782327 ATF1_peak_202 66.2886 chr17 782353 782354 MRM3 ENSG00000171861 + 27
## 204 chr17 4143675 4143676 ATF1_peak_203 14.7257 chr17 4143168 4143169 CYB5D2 ENSG00000167740 + -507
## 205 chr17 4704157 4704158 ATF1_peak_204 30.8864 chr17 4704336 4704337 PELP1 ENSG00000141456 - 179
Notes on relational operators for logical tests:
Site: http://sites.stat.psu.edu/~dhunter/R/html/base/html/Comparison.html
Operator : Interpretation
<
y : x is less than y>
y : x is greater than y<=
y : x is less than or equal to y>=
y : x is greater than or equal to y==
y : x is equal to y!=
y : x is not equal to yAlso when conducting logical tests:
&
= and|
= orOne simple way to plot the distribution of a data set is to use boxplots:
boxplot(closest[,12])
median - Middle dark line.
box - Interquartile ranges. i.e. extending from 25th percentile to 75th percentile.
whisker - Extremes (cover 90% of the data).
outliers - Points outside of whiskers.
The default parameters from boxplot
are not great for publication. However, there are options specific to the boxplot function that we can use to make the figure more meaningful and understandable.
For instance: see help(boxplots)
)
outline=F
)border="red"
)lwd=2
)boxplot(closest[,12], outline=FALSE, border='red', col='white', lwd=2)
lwd = line width
col = inside box color
border= border color
outline= display outliers (if TRUE)
boxplot(closest[,12], outline=F, col='blue', lwd=2,
xlab='ATF1 peaks', ylab='distance (bp)',
main = "Distance of peaks from gene TSSs")
hist(closest[,12])
hist
also creates an object with several informative attributes, that you can maintain in your workspace by assigning a variable to hist()
.
closest_hist<-hist(closest[,12], plot=F)
#see names of associated data
names(closest_hist)
## [1] "breaks" "counts" "density" "mids" "xname" "equidist"
# print all associated data
closest_hist
## $breaks
## [1] -300000 -250000 -200000 -150000 -100000 -50000 0 50000 100000 150000 200000 250000 300000
##
## $counts
## [1] 1 0 0 1 8 311 285 6 0 1 1 1
##
## $density
## [1] 3.252033e-08 0.000000e+00 0.000000e+00 3.252033e-08 2.601626e-07 1.011382e-05 9.268293e-06 1.951220e-07 0.000000e+00 3.252033e-08 3.252033e-08 3.252033e-08
##
## $mids
## [1] -275000 -225000 -175000 -125000 -75000 -25000 25000 75000 125000 175000 225000 275000
##
## $xname
## [1] "closest[, 12]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# print 'summary' of the data
summary(closest_hist)
## Length Class Mode
## breaks 13 -none- numeric
## counts 12 -none- numeric
## density 12 -none- numeric
## mids 12 -none- numeric
## xname 1 -none- character
## equidist 1 -none- logical
Options when plotting histograms
#breaks sets the number of divisions for grouping bars
closest_hist<-hist(closest[,12], plot=T,
main="Distance of ATF1 peaks from gene TSSs",
xlab="Distance (bp)")
One can also add detail to the plot by increasing the number of breaks
in the distribution.
#breaks sets the number of divisions for grouping bars
closest_hist<-hist(closest[,12], plot=T, xlim= c(-50000, 50000), breaks=500,
main="Distance of ATF1 peaks from gene TSSs",
xlab="Distance (bp)")
Having the histogram information saved in your workspace allows you to plot this in different ways:
#plot points of histogram
plot(closest_hist$mids, closest_hist$counts, xlim= c(-50000, 50000),
main="Distance of ATF1 peaks from gene TSSs",
xlab="Distance (bp)", ylab="Frequency")
#plot points of histogram
plot(closest_hist$mids, closest_hist$counts, xlim= c(-50000, 50000),
type="l", lwd=2, col='red',
main="Distance of ATF1 peaks from gene TSSs",
xlab="Distance (bp)", ylab="Frequency")
Options with lines and symbols:
adding plots/lines to graphs:
#plot points of histogram
plot(closest_hist,
main="Distance of ATF1 peaks from gene TSSs", xlab="Distance (bp)",
ylab="Frequency", xlim= c(-50000, 50000))
lines(closest_hist$mids, closest_hist$counts, type="l", lwd=2, col='red')
The cumulative distribution adds to the y-axis as more groups are more groups (x-axis) are encountered. ecdf()
will create an object that can be used for plotting a distribution.
#create ECDF object
closest_cdf <- ecdf(closest[,12])
plot(closest_cdf, xlim= c(-50000, 50000))
Add lines to plots: Lets add a vertical, dotted line at the median:
#first find the median
quantile(closest[,12])
## 0% 25% 50% 75% 100%
## -255380 -64 -1 77 295571
OR
median(closest[,12])
## [1] -1
#create ECDF object
plot(closest_cdf, xlim= c(-50000, 50000),
main = "CDF of ATF1 distance to TSSs",
ylab='fraction', xlab="Distance (bp)")
#add vertical line at median
abline(v=median(closest[,12]), lty=2)
abline(h=.5, lty=2)
See help(legend)
plot(closest_cdf, col='red', xlim= c(-50000, 50000),
main = "CDF of ATF1 distance to TSSs", ylab='fraction', xlab="Distance (bp)")
abline(v=median(closest[,12]), lty=2)
abline(h=.5, lty=2)
#Specify position, symbol type, col, box type
legend('bottomright', c("ATF1"), inset= 0.1, pch=19, col='red')
# symbol can be made a line with "lty="
Saving plots is easy:
You can save the current plot window with:
dev.copy(jpeg,filename="plot.jpg")
dev.off()
It is better to specify the file first, plot to it, and then close the device:
# can be done with jpeg, pdf, tiff, eps, png, etc. formats
pdf(file="plot.jpg", width=4, height=4)
# Do your plotting
# Add a Legend
dev.off()
Functions are routines that perform specific tasks. The basic form of a function is as follows:
function_name <- function(arg_1, arg_2, ...) {
Function body
}
The following function will add two input values together and multiply the result by 14:
add.numbers.fourteen <- function(x, y) {
z = (x + y)*14
return(z)
}
add.numbers.fourteen(1,2)
res = add.numbers.fourteen(1,2)
print(res)
You will:
Get the following files using sftp
from the server if you haven’t already:
get /home/FCAM/meds5420/data/ATF1/closest_peak_TSS/peaks_promoters.bed
get /home/FCAM/meds5420/data/ATF1/reads_in_peaks/ATF1_summits_rep1_counts_2col.bed
get /home/FCAM/meds5420/data/ATF1/reads_in_peaks/ATF1_summits_rep2_counts_2col.bed
Try plotting the intensity of the peaks from the first replicate counts file (y-axis) across your chromosome (peaks/promoters file) (x-axis).
Try plotting the data as a line.
Zoom in to smaller windows by limiting the x-axis to 1mb, 100kb, 10kb windows (make sure you choose a region that sill contains at least one peak or you’ll see very little).
Using one of the replicates for your chromosome, try plotting the following:
Try making a scatterplot to see how well the two replicates agree. Use plot(x,y)
.
Try the same scatterplot with log scaled data: Use log2(data)
.
Add a main title to the graph
Label the x- and y- axes with more descriptive titles
Experiment with different size, shape, and color of the points to see which reveals the most information about the data.
Do you recall the quadratic formula? Me either! But we have Google.
Write a function to solve a quadratic equation and have the results return as a list.
solve: 0.5x^2 -2.5x + 2 = 0
report both solutions for x.
x = read.table('ATF1_summits_rep1_counts_2col.bed', sep = '\t')
closest <- read.table("peaks_promoters.bed")
df.x = merge(x, closest, by.x = "V1", by.y = "V4")
head(df.x)
df.x.chr17 = df.x[df.x$V6=="chr17",]
head(df.x.chr17)
plot(df.x.chr17$V2.y, df.x.chr17$V2.x)
#make a publishable pdf
pdf(file="plot.pdf", width=4, height=4)
plot(df.x.chr17$V2.y, df.x.chr17$V2.x, type = "l")
dev.off()
plot(df.x.chr17$V2.y, df.x.chr17$V2.x, xlim = c(0, 1000000))
plot(df.x.chr17$V2.y, df.x.chr17$V2.x, xlim = c(700000, 800000))
plot(df.x.chr17$V2.y, df.x.chr17$V2.x, xlim = c(780000, 790000))
boxplot(df.x.chr17$V2.x, outline=FALSE, border='red', col='white', lwd=2)
hist(df.x.chr17$V2.y, breaks=100, main="Position of peaks along chr17", xlab="chr17 position")
hist_positions <- hist(df.x.chr17$V2.y, breaks=100, plot=F)
lines(hist_positions$mids, hist_positions$counts, type="l", lwd=2, col='blue')
closest_cdf <- ecdf(abs(df.x.chr17$V12))
plot(closest_cdf,
main = "CDF of ATF1 distance to TSSs",
ylab='fraction', xlab="Distance (bp)")
#add vertical line at median
abline(v=median(abs(df.x.chr17$V12)), lty=2)
abline(h=.5, lty=2)
legend('bottomright', c("ATF1"), inset= 0.1, pch=19, col='black')
#scatter
y = read.table('ATF1_summits_rep2_counts_2col.bed', sep = '\t')
plot(x[,2], y[,2])
plot(x[,2], y[,2], main = "replicate concordance")
plot(x[,2], y[,2], main = "replicate concordance", ylab = "replicate 2 counts", xlab = "replicate 1 counts")
plot(x[,2], y[,2], main = "replicate concordance", ylab = "replicate 2 counts", xlab = "replicate 1 counts", cex = 0.5, pch = 16, col = '#80808080')
# quadratic
quad.formula <- function(a, b, c) {
x1 = (-b + sqrt((b^2) - (4*a*c)))/(2*a)
x2 = (-b - sqrt((b^2) - (4*a*c)))/(2*a)
return(list(x1, x2))
}
answers = quad.formula(0.5, -2.5, 2)
answers[[1]]
answers[[2]]