Load and explore the data
Overview
Teaching: 15 min
Exercises: 30 minQuestions
What data are required for eqtl mapping?
Objectives
To provide an example and exploration of data used for eqtl mapping.
Load the libraries.
library(tidyverse)
library(knitr)
library(corrplot)
# the following analysis is from File S1 Attie_eQTL_paper_physiology.Rmd
# compliments of Daniel Gatti. See Data Dryad entry for more information.
Physiological Phenotypes
The complete data used in these analyses are available from Data Dryad.
Load in the clinical phenotypes.
# load the data
load("../data/attie_DO500_clinical.phenotypes.RData")
See the data dictionary to see a description of each of these phenotypes. You can also view a table of the data dictionary.
pheno_clin_dict %>%
select(description, formula) %>%
kable()
description | formula | |
---|---|---|
mouse | Animal identifier. | NA |
sex | Male (M) or female (F). | NA |
sac_date | Date when mouse was sacrificed; used to compute days on diet, using birth dates. | NA |
partial_inflation | Some mice showed a partial pancreatic inflation which would negatively effect the total number of islets collected from these mice. | NA |
coat_color | Visual inspection by Kathy Schuler on coat color. | NA |
oGTT_date | Date the oGTT was performed. | NA |
FAD_NAD_paired | A change in the method that was used to make this measurement by Matt Merrins’ lab. Paired was the same islet for the value at 3.3mM vs. 8.3mM glucose; unpaired was where averages were used for each glucose concentration and used to compute ratio. | NA |
FAD_NAD_filter_set | A different filter set was used on the microscope to make the fluorescent measurement; may have influenced the values. | NA |
crumblers | Some mice store food in their bedding (hoarders) which would be incorrectly interpreted as consumed. | NA |
birthdate | Birth date | NA |
diet_days | Number of days. | NA |
num_islets | Total number of islets harvested per mouse; negatively impacted by those with partial inflation. | NA |
Ins_per_islet | Amount of insulin per islet in units of ng/ml/islet. | NA |
WPIC | Derived number; equal to total number of islets times insulin content per islet. | Ins_per_islet * num_islets |
HOMA_IR_0min | glucose*insulin/405 at time t=0 for the oGTT | Glu_0min * Ins_0min / 405 |
HOMA_B_0min | 360 * Insulin / (Glucose - 63) at time t=0 for the oGTT | 360 * Ins_0min / (Glu_0min - 63) |
Glu_tAUC | Area under the curve (AUC) calculation without any correction for baseline differences. | complicated |
Ins_tAUC | Area under the curve (AUC) calculation without any correction for baseline differences. | complicated |
Glu_6wk | Plasma glucose with units of mg/dl; fasting. | NA |
Ins_6wk | Plasma insulin with units of ng/ml; fasting. | NA |
TG_6wk | Plasma triglyceride (TG) with units of mg/dl; fasting. | NA |
Glu_10wk | Plasma glucose with units of mg/dl; fasting. | NA |
Ins_10wk | Plasma insulin with units of ng/ml; fasting. | NA |
TG_10wk | Plasma triglyceride (TG) with units of mg/dl; fasting. | NA |
Glu_14wk | Plasma glucose with units of mg/dl; fasting. | NA |
Ins_14wk | Plasma insulin with units of ng/ml; fasting. | NA |
TG_14wk | Plasma triglyceride (TG) with units of mg/dl; fasting. | NA |
food_ave | Average food consumption over the measurements made for each mouse. | complicated |
weight_2wk | Body weight at indicated date; units are gm. | NA |
weight_6wk | Body weight at indicated date; units are gm. | NA |
weight_10wk | Body weight at indicated date; units are gm. | NA |
DOwave | Wave (i.e., batch) of DO mice | NA |
# convert sex and DO wave (batch) to factors
pheno_clin$sex = factor(pheno_clin$sex)
pheno_clin$DOwave = factor(pheno_clin$DOwave)
Figure 1 Boxplots
pheno_clin %>%
select(mouse, sex, starts_with("weight")) %>%
gather(week, value, -mouse, -sex) %>%
separate(week, c("tmp", "week")) %>%
mutate(week = factor(week, levels = c("2wk", "6wk", "10wk"))) %>%
ggplot(aes(week, value, fill = sex)) +
geom_boxplot() +
scale_y_log10() +
labs(title = "Body Weight", y = "Weight")
pheno_clin %>%
select(mouse, sex, starts_with("Glu")) %>%
select(mouse, sex, ends_with("wk")) %>%
gather(week, value, -mouse, -sex) %>%
separate(week, c("tmp", "week")) %>%
mutate(week = factor(week, levels = c("6wk", "10wk", "14wk"))) %>%
ggplot(aes(week, value, fill = sex)) +
geom_boxplot() +
scale_y_log10() +
labs(title = "Glucose", y = "Glucose")
pheno_clin %>%
select(mouse, sex, starts_with("Ins")) %>%
select(mouse, sex, ends_with("wk")) %>%
gather(week, value, -mouse, -sex) %>%
separate(week, c("tmp", "week")) %>%
mutate(week = factor(week, levels = c("6wk", "10wk", "14wk"))) %>%
ggplot(aes(week, value, fill = sex)) +
geom_boxplot() +
scale_y_log10() +
labs(title = "Insulin", y = "Insulin")
pheno_clin %>%
select(mouse, sex, starts_with("TG")) %>%
select(mouse, sex, ends_with("wk")) %>%
gather(week, value, -mouse, -sex) %>%
separate(week, c("tmp", "week")) %>%
mutate(week = factor(week, levels = c("6wk", "10wk", "14wk"))) %>%
ggplot(aes(week, value, fill = sex)) +
geom_boxplot() +
scale_y_log10() +
labs(title = "TG", y = "TG")
pheno_clin %>%
select(mouse, sex, num_islets:Ins_tAUC, food_ave) %>%
gather(phenotype, value, -mouse, -sex) %>%
ggplot(aes(sex, value, fill = sex)) +
geom_boxplot() +
scale_y_log10() +
facet_wrap(~phenotype, scales = "free_y")
QA/QC
Log transform and standardize each phenotype. Consider setting points that are more than 5 std. dev. from the mean to NA. Only do this if the final distribution doesn’t look skewed.
pheno_clin_log = pheno_clin %>%
mutate_if(is.numeric, log)
pheno_clin_std = pheno_clin_log %>%
select(mouse, num_islets:weight_10wk) %>%
mutate_if(is.numeric, scale)
pheno_clin_std %>%
select(num_islets:weight_10wk) %>%
gather(phenotype, value) %>%
ggplot(aes(x = phenotype, y = value)) +
geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(title = "Distribution of Standardized Phenotypes")
Tests for sex, wave and diet_days.
tmp = pheno_clin_log %>%
select(mouse, sex, DOwave, diet_days, num_islets:weight_10wk) %>%
gather(phenotype, value, -mouse, -sex, -DOwave, -diet_days) %>%
group_by(phenotype) %>%
nest()
mod_fxn = function(df) {
lm(value ~ sex + DOwave + diet_days, data = df)
}
tmp = tmp %>%
mutate(model = map(data, mod_fxn)) %>%
mutate(summ = map(model, tidy)) %>%
unnest(summ)
Error in `mutate()`:
! Problem while computing `summ = map(model, tidy)`.
ℹ The error occurred in group 1: phenotype = "food_ave".
Caused by error in `as_mapper()`:
! object 'tidy' not found
# kable(tmp, caption = "Effects of Sex, Wave & Diet Days on Phenotypes")
tmp %>%
filter(term != "(Intercept)") %>%
mutate(neg.log.p = -log10(p.value)) %>%
ggplot(aes(term, neg.log.p)) +
geom_point() +
facet_wrap(~phenotype) +
labs(title = "Significance of Sex, Wave & Diet Days on Phenotypes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
rm(tmp)
Error in `filter()`:
! Problem while computing `..1 = term != "(Intercept)"`.
ℹ The error occurred in group 1: phenotype = "food_ave".
Caused by error in `mask$eval_all_filter()`:
! object 'term' not found
Weight vs. Food Intake
pheno_clin_log %>%
select(mouse, sex, food_ave:weight_10wk) %>%
gather(phenotype, value, -mouse, -sex, -food_ave) %>%
separate(phenotype, c("phenotype", "week")) %>%
mutate(week = factor(week, levels = c("2wk", "6wk", "10wk"))) %>%
ggplot(aes(food_ave, value, color = sex)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Food Intake vs. Body Weight", y = "log(Body Weight)") +
facet_wrap(~week)
model_fxn = function(df) { lm(value ~ sex*food_ave, data = df) }
tmp = pheno_clin_log %>%
select(mouse, sex, food_ave:weight_10wk) %>%
gather(phenotype, value, -mouse, -sex, -food_ave) %>%
separate(phenotype, c("phenotype", "week")) %>%
mutate(week = factor(week, levels = c("2wk", "6wk", "10wk"))) %>%
group_by(week) %>%
nest() %>%
mutate(model = map(data, model_fxn)) %>%
mutate(summ = map(model, tidy)) %>%
unnest(summ) %>%
filter(term != "(Intercept)") %>%
mutate(p.adj = p.adjust(p.value))
Error in `mutate()`:
! Problem while computing `summ = map(model, tidy)`.
ℹ The error occurred in group 1: week = 2wk.
Caused by error in `as_mapper()`:
! object 'tidy' not found
# kable(tmp, caption = "Effects of Sex and Food Intake on Body Weight")
Correlation Plots
Females
tmp = pheno_clin_log %>%
filter(sex == "F") %>%
select(starts_with(c("Ins", "Glu", "TG")))
tmp = cor(tmp, use = "pairwise")
corrplot.mixed(tmp, upper = "ellipse", lower = "number",
main = "Female Clinical Phenotype Correlation")
corrplot.mixed(tmp, upper = "ellipse", lower = "number",
main = "Female Clinical Phenotype Correlation")
Males
tmp = pheno_clin_log %>%
filter(sex == "M") %>%
select(starts_with(c("Ins", "Glu", "TG")))
tmp = cor(tmp, use = "pairwise")
corrplot.mixed(tmp, upper = "ellipse", lower = "number",
main = "Male Clinical Phenotype Correlation")
corrplot.mixed(tmp, upper = "ellipse", lower = "number",
main = "Male Clinical Phenotype Correlation")
Gene Expression Phenotypes
# load the expression data along with annotations and metadata
load("../data/dataset.islet.rnaseq.RData")
names(dataset.islet.rnaseq)
[1] "annots" "covar" "covar.factors" "datatype"
[5] "display.name" "expr" "lod.peaks" "raw"
[9] "samples"
# look at gene annotations
dataset.islet.rnaseq$annots[1:6,]
gene_id symbol chr start end strand
ENSMUSG00000000001 ENSMUSG00000000001 Gnai3 3 108.10728 108.14615 -1
ENSMUSG00000000028 ENSMUSG00000000028 Cdc45 16 18.78045 18.81199 -1
ENSMUSG00000000037 ENSMUSG00000000037 Scml2 X 161.11719 161.25821 1
ENSMUSG00000000049 ENSMUSG00000000049 Apoh 11 108.34335 108.41440 1
ENSMUSG00000000056 ENSMUSG00000000056 Narf 11 121.23725 121.25586 1
ENSMUSG00000000058 ENSMUSG00000000058 Cav2 6 17.28119 17.28911 1
middle nearest.marker.id biotype module
ENSMUSG00000000001 108.12671 3_108090236 protein_coding darkgreen
ENSMUSG00000000028 18.79622 16_18817262 protein_coding grey
ENSMUSG00000000037 161.18770 X_161182677 protein_coding grey
ENSMUSG00000000049 108.37887 11_108369225 protein_coding greenyellow
ENSMUSG00000000056 121.24655 11_121200487 protein_coding brown
ENSMUSG00000000058 17.28515 6_17288298 protein_coding brown
hotspot
ENSMUSG00000000001 <NA>
ENSMUSG00000000028 <NA>
ENSMUSG00000000037 <NA>
ENSMUSG00000000049 <NA>
ENSMUSG00000000056 <NA>
ENSMUSG00000000058 <NA>
# look at raw counts
dataset.islet.rnaseq$raw[1:6,1:6]
ENSMUSG00000000001 ENSMUSG00000000028 ENSMUSG00000000037
DO021 10247 108 29
DO022 11838 187 35
DO023 12591 160 19
DO024 12424 216 30
DO025 10906 76 21
DO026 12248 110 34
ENSMUSG00000000049 ENSMUSG00000000056 ENSMUSG00000000058
DO021 15 120 703
DO022 18 136 747
DO023 18 275 1081
DO024 81 160 761
DO025 7 163 770
DO026 18 204 644
# look at sample metadata
# summarize mouse sex, birth dates and DO waves
table(dataset.islet.rnaseq$samples[, c("sex", "birthdate")])
birthdate
sex 2014-05-29 2014-10-15 2015-02-25 2015-09-22
F 46 46 49 47
M 45 48 50 47
table(dataset.islet.rnaseq$samples[, c("sex", "DOwave")])
DOwave
sex 1 2 3 4 5
F 46 46 49 47 0
M 45 48 50 47 0
In order to make reasonable gene comparisons between samples, the count data need to be normalized. In the quantile-quantile (Q-Q) plot below, count data for the first gene are plotted over a diagonal line tracing a normal distribution for those counts. Notice that most of the count data values lie off of this line, indicating that these gene counts are not normally distributed.
The same plot can be drawn as shown below. The diagonal line represents a standard normal distribution with mean 0 and standard deviation 1. Count data values are plotted against this standard normal distribution.
qqnorm(dataset.islet.rnaseq$raw[,1],
xlab="Theoretical normal percentiles",
ylab="Count percentiles",
main="Count distribution for gene ENSMUSG00000000001")
qqline(dataset.islet.rnaseq$raw[,1])
Q-Q plots for the first six genes show that count data for these genes are not normally distributed. They are also not on the same scale. The y-axis values for each subplot range to 20,000 counts in the first subplot, 250 in the second, 80 in the third, and so on.
par(mfrow=c(2,3))
for (i in 1:6) {
qqnorm(dataset.islet.rnaseq$raw[,i])
qqline(dataset.islet.rnaseq$raw[,i])
}
Q-Q plots of the normalized expression data for the first six genes show that the data values match the diagonal line well, meaning that they are now normally distributed. They are also all on the same scale now as well.
par(mfrow=c(2,3))
for (i in 1:6) {
qqnorm(dataset.islet.rnaseq$expr[,i])
qqline(dataset.islet.rnaseq$expr[,i])
}
Boxplots of raw counts for 5 example genes are shown at left below. Notice that the median count values (horizontal black bar in each boxplot) are not comparable between the genes because the counts are not on the same scale. At right, boxplots for the same 5 genes show normalized count data on the same scale.
par(las=2, mfrow=c(1,2))
boxplot(dataset.islet.rnaseq$raw[,c(5:9)],
main="Raw count distributions for 5 example genes")
boxplot(dataset.islet.rnaseq$expr[,c(5:9)],
main="Normalized count distributions for the same 5 example genes")
# look at normalized counts
dataset.islet.rnaseq$expr[1:6,1:6]
ENSMUSG00000000001 ENSMUSG00000000028 ENSMUSG00000000037
DO021 -0.09858728 0.19145703 0.5364855
DO022 0.97443030 2.41565800 1.1599398
DO023 0.75497867 1.62242658 -0.4767954
DO024 1.21299720 2.79216889 0.7205211
DO025 1.62242658 -0.27918258 0.3272808
DO026 0.48416010 0.03281516 0.8453674
ENSMUSG00000000049 ENSMUSG00000000056 ENSMUSG00000000058
DO021 0.67037652 -1.4903750 0.6703765
DO022 1.03980755 -1.1221937 0.9744303
DO023 0.89324170 0.2317220 1.8245906
DO024 2.15179362 -0.7903569 0.9957498
DO025 -0.03281516 -0.2587715 1.5530179
DO026 0.86427916 -0.3973634 0.2791826
The expression data loaded provides LOD peaks for the eQTL analyses performed in this study. As a preview of what you will be doing next, extract the LOD peaks for chromosome 11.
# look at LOD peaks
dataset.islet.rnaseq$lod.peaks[1:6,]
annot.id marker.id chrom pos lod
1 ENSMUSG00000037922 3_136728699 3 136.728699 11.976856
2 ENSMUSG00000037926 2_164758936 2 164.758936 7.091543
3 ENSMUSG00000037926 5_147178504 5 147.178504 6.248598
4 ENSMUSG00000037933 5_147253583 5 147.253583 8.581871
5 ENSMUSG00000037933 13_6542783 13 6.542783 6.065497
6 ENSMUSG00000037935 11_99340415 11 99.340415 8.089051
# look at chromosome 11 LOD peaks
chr11_peaks <- dataset.islet.rnaseq$annots %>%
select(gene_id, chr) %>%
filter(chr=="11") %>%
left_join(dataset.islet.rnaseq$lod.peaks,
by = c("chr" = "chrom", "gene_id" = "annot.id"))
# look at the first several rows of chromosome 11 peaks
head(chr11_peaks)
gene_id chr marker.id pos lod
1 ENSMUSG00000000049 11 11_109408556 109.40856 8.337996
2 ENSMUSG00000000056 11 11_121434933 121.43493 11.379449
3 ENSMUSG00000000093 11 11_74732783 74.73278 9.889119
4 ENSMUSG00000000120 11 11_95962181 95.96218 11.257591
5 ENSMUSG00000000125 11 <NA> NA NA
6 ENSMUSG00000000126 11 11_26992236 26.99224 6.013533
# how many rows?
dim(chr11_peaks)
[1] 1810 5
# how many rows have LOD scores?
chr11_peaks %>% filter(!is.na(lod)) %>% dim()
[1] 1208 5
# sort chromosome 11 peaks by LOD score
chr11_peaks %>% arrange(desc(lod)) %>% head()
gene_id chr marker.id pos lod
1 ENSMUSG00000020268 11 11_55073940 55.07394 189.2332
2 ENSMUSG00000020333 11 11_53961279 53.96128 147.8134
3 ENSMUSG00000017404 11 11_98062374 98.06237 136.0430
4 ENSMUSG00000093483 11 11_83104215 83.10421 135.1067
5 ENSMUSG00000058546 11 11_78204410 78.20441 132.3365
6 ENSMUSG00000078695 11 11_97596132 97.59613 131.8005
# range of LOD scores and positions
range(chr11_peaks$lod, na.rm = TRUE)
[1] 6.000975 189.233211
range(chr11_peaks$pos, na.rm = TRUE)
[1] 3.126009 122.078650
# view LOD scores by position
chr11_peaks %>% arrange(desc(lod)) %>%
ggplot(aes(pos, lod)) + geom_point()
Warning: Removed 602 rows containing missing values (geom_point).
Key Points