Load and explore the data

Overview

Teaching: 15 min
Exercises: 30 min

Questions

What data are required for eqtl mapping?

Objectives

To provide an example and exploration of data used for eqtl mapping.

Load the libraries.

library(tidyverse)
library(knitr)
library(corrplot)
# the following analysis is from File S1 Attie_eQTL_paper_physiology.Rmd 
# compliments of Daniel Gatti. See Data Dryad entry for more information.

Physiological Phenotypes

The complete data used in these analyses are available from Data Dryad.

Load in the clinical phenotypes.

# load the data
load("../data/attie_DO500_clinical.phenotypes.RData")

See the data dictionary to see a description of each of these phenotypes. You can also view a table of the data dictionary.

pheno_clin_dict %>% 
  select(description, formula) %>% 
  kable()

	description	formula
mouse	Animal identifier.	NA
sex	Male (M) or female (F).	NA
sac_date	Date when mouse was sacrificed; used to compute days on diet, using birth dates.	NA
partial_inflation	Some mice showed a partial pancreatic inflation which would negatively effect the total number of islets collected from these mice.	NA
coat_color	Visual inspection by Kathy Schuler on coat color.	NA
oGTT_date	Date the oGTT was performed.	NA
FAD_NAD_paired	A change in the method that was used to make this measurement by Matt Merrins’ lab. Paired was the same islet for the value at 3.3mM vs. 8.3mM glucose; unpaired was where averages were used for each glucose concentration and used to compute ratio.	NA
FAD_NAD_filter_set	A different filter set was used on the microscope to make the fluorescent measurement; may have influenced the values.	NA
crumblers	Some mice store food in their bedding (hoarders) which would be incorrectly interpreted as consumed.	NA
birthdate	Birth date	NA
diet_days	Number of days.	NA
num_islets	Total number of islets harvested per mouse; negatively impacted by those with partial inflation.	NA
Ins_per_islet	Amount of insulin per islet in units of ng/ml/islet.	NA
WPIC	Derived number; equal to total number of islets times insulin content per islet.	Ins_per_islet * num_islets
HOMA_IR_0min	glucose*insulin/405 at time t=0 for the oGTT	Glu_0min * Ins_0min / 405
HOMA_B_0min	360 * Insulin / (Glucose - 63) at time t=0 for the oGTT	360 * Ins_0min / (Glu_0min - 63)
Glu_tAUC	Area under the curve (AUC) calculation without any correction for baseline differences.	complicated
Ins_tAUC	Area under the curve (AUC) calculation without any correction for baseline differences.	complicated
Glu_6wk	Plasma glucose with units of mg/dl; fasting.	NA
Ins_6wk	Plasma insulin with units of ng/ml; fasting.	NA
TG_6wk	Plasma triglyceride (TG) with units of mg/dl; fasting.	NA
Glu_10wk	Plasma glucose with units of mg/dl; fasting.	NA
Ins_10wk	Plasma insulin with units of ng/ml; fasting.	NA
TG_10wk	Plasma triglyceride (TG) with units of mg/dl; fasting.	NA
Glu_14wk	Plasma glucose with units of mg/dl; fasting.	NA
Ins_14wk	Plasma insulin with units of ng/ml; fasting.	NA
TG_14wk	Plasma triglyceride (TG) with units of mg/dl; fasting.	NA
food_ave	Average food consumption over the measurements made for each mouse.	complicated
weight_2wk	Body weight at indicated date; units are gm.	NA
weight_6wk	Body weight at indicated date; units are gm.	NA
weight_10wk	Body weight at indicated date; units are gm.	NA
DOwave	Wave (i.e., batch) of DO mice	NA

# convert sex and DO wave (batch) to factors
pheno_clin$sex = factor(pheno_clin$sex)
pheno_clin$DOwave = factor(pheno_clin$DOwave)

Figure 1 Boxplots

pheno_clin %>%
  select(mouse, sex, starts_with("weight")) %>%
  gather(week, value, -mouse, -sex) %>%
  separate(week, c("tmp", "week")) %>%
  mutate(week = factor(week, levels = c("2wk", "6wk", "10wk"))) %>%
  ggplot(aes(week, value, fill = sex)) +
    geom_boxplot() +
    scale_y_log10() +
    labs(title = "Body Weight", y = "Weight")

plot of chunk bw_boxplot

pheno_clin %>%
  select(mouse, sex, starts_with("Glu")) %>%
  select(mouse, sex, ends_with("wk")) %>%
  gather(week, value, -mouse, -sex) %>%
  separate(week, c("tmp", "week")) %>%
  mutate(week = factor(week, levels = c("6wk", "10wk", "14wk"))) %>%
  ggplot(aes(week, value, fill = sex)) +
    geom_boxplot() +
    scale_y_log10() +
    labs(title = "Glucose", y = "Glucose")

plot of chunk glucose_boxplot

pheno_clin %>%
  select(mouse, sex, starts_with("Ins")) %>%
  select(mouse, sex, ends_with("wk")) %>%
  gather(week, value, -mouse, -sex) %>%
  separate(week, c("tmp", "week")) %>%
  mutate(week = factor(week, levels = c("6wk", "10wk", "14wk"))) %>%
  ggplot(aes(week, value, fill = sex)) +
    geom_boxplot() +
    scale_y_log10() +
    labs(title = "Insulin", y = "Insulin")

plot of chunk insulin_boxplot

pheno_clin %>%
  select(mouse, sex, starts_with("TG")) %>%
  select(mouse, sex, ends_with("wk")) %>%
  gather(week, value, -mouse, -sex) %>%
  separate(week, c("tmp", "week")) %>%
  mutate(week = factor(week, levels = c("6wk", "10wk", "14wk"))) %>%
  ggplot(aes(week, value, fill = sex)) +
    geom_boxplot() +
    scale_y_log10() +
    labs(title = "TG", y = "TG")

plot of chunk trig_boxplot

pheno_clin %>%
  select(mouse, sex, num_islets:Ins_tAUC, food_ave) %>%
  gather(phenotype, value, -mouse, -sex) %>%
  ggplot(aes(sex, value, fill = sex)) +
    geom_boxplot() +
    scale_y_log10() +
    facet_wrap(~phenotype, scales = "free_y")

plot of chunk fig1_boxplots

QA/QC

Log transform and standardize each phenotype. Consider setting points that are more than 5 std. dev. from the mean to NA. Only do this if the final distribution doesn’t look skewed.

pheno_clin_log = pheno_clin %>%
                   mutate_if(is.numeric, log)
pheno_clin_std = pheno_clin_log %>%
                   select(mouse, num_islets:weight_10wk) %>%
                   mutate_if(is.numeric, scale)
pheno_clin_std %>%
  select(num_islets:weight_10wk) %>%
  gather(phenotype, value) %>%
  ggplot(aes(x = phenotype, y = value)) +
    geom_boxplot() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
    labs(title = "Distribution of Standardized Phenotypes")

plot of chunk pheno_std

Tests for sex, wave and diet_days.

tmp = pheno_clin_log %>%
        select(mouse, sex, DOwave, diet_days, num_islets:weight_10wk) %>%
        gather(phenotype, value, -mouse, -sex, -DOwave, -diet_days) %>%
        group_by(phenotype) %>%
        nest()
mod_fxn = function(df) {
  lm(value ~ sex + DOwave + diet_days, data = df)
}
tmp = tmp %>%
  mutate(model = map(data, mod_fxn)) %>%
  mutate(summ = map(model, tidy)) %>%
  unnest(summ)

Error in `mutate()`:
! Problem while computing `summ = map(model, tidy)`.
ℹ The error occurred in group 1: phenotype = "food_ave".
Caused by error in `as_mapper()`:
! object 'tidy' not found

# kable(tmp, caption = "Effects of Sex, Wave & Diet Days on Phenotypes")

tmp %>%
  filter(term != "(Intercept)") %>%
  mutate(neg.log.p = -log10(p.value)) %>%
  ggplot(aes(term, neg.log.p)) +
    geom_point() +
    facet_wrap(~phenotype) +
    labs(title = "Significance of Sex, Wave & Diet Days on Phenotypes") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
rm(tmp)

Error in `filter()`:
! Problem while computing `..1 = term != "(Intercept)"`.
ℹ The error occurred in group 1: phenotype = "food_ave".
Caused by error in `mask$eval_all_filter()`:
! object 'term' not found

Weight vs. Food Intake

pheno_clin_log %>%
  select(mouse, sex, food_ave:weight_10wk) %>%
  gather(phenotype, value, -mouse, -sex, -food_ave) %>%
  separate(phenotype, c("phenotype", "week")) %>%
  mutate(week = factor(week, levels = c("2wk", "6wk", "10wk"))) %>%
  ggplot(aes(food_ave, value, color = sex)) +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(title = "Food Intake vs. Body Weight", y = "log(Body Weight)") + 
    facet_wrap(~week)

plot of chunk bw_vs_food

model_fxn = function(df) { lm(value ~ sex*food_ave, data = df) }
tmp = pheno_clin_log %>%
  select(mouse, sex, food_ave:weight_10wk) %>%
  gather(phenotype, value, -mouse, -sex, -food_ave) %>%
  separate(phenotype, c("phenotype", "week")) %>%
  mutate(week = factor(week, levels = c("2wk", "6wk", "10wk"))) %>%
  group_by(week) %>%
  nest() %>%
  mutate(model = map(data, model_fxn)) %>%
  mutate(summ = map(model, tidy)) %>%
  unnest(summ) %>%
  filter(term != "(Intercept)") %>%
  mutate(p.adj = p.adjust(p.value))

Error in `mutate()`:
! Problem while computing `summ = map(model, tidy)`.
ℹ The error occurred in group 1: week = 2wk.
Caused by error in `as_mapper()`:
! object 'tidy' not found

# kable(tmp, caption = "Effects of Sex and Food Intake on Body Weight")

Correlation Plots

Females

tmp = pheno_clin_log %>% 
        filter(sex == "F") %>%
        select(starts_with(c("Ins", "Glu", "TG")))
tmp = cor(tmp, use = "pairwise")
corrplot.mixed(tmp, upper = "ellipse", lower = "number", 
               main = "Female Clinical Phenotype Correlation")
corrplot.mixed(tmp, upper = "ellipse", lower = "number", 
               main = "Female Clinical Phenotype Correlation")

plot of chunk female_corr_plot

Males

tmp = pheno_clin_log %>% 
        filter(sex == "M") %>%
        select(starts_with(c("Ins", "Glu", "TG")))
tmp = cor(tmp, use = "pairwise")
corrplot.mixed(tmp, upper = "ellipse", lower = "number", 
               main = "Male Clinical Phenotype Correlation")
corrplot.mixed(tmp, upper = "ellipse", lower = "number", 
               main = "Male Clinical Phenotype Correlation")

plot of chunk male_corr_plot

Gene Expression Phenotypes

# load the expression data along with annotations and metadata
load("../data/dataset.islet.rnaseq.RData")
names(dataset.islet.rnaseq)

[1] "annots"        "covar"         "covar.factors" "datatype"     
[5] "display.name"  "expr"          "lod.peaks"     "raw"          
[9] "samples"      

# look at gene annotations
dataset.islet.rnaseq$annots[1:6,]

                              gene_id symbol chr     start       end strand
ENSMUSG00000000001 ENSMUSG00000000001  Gnai3   3 108.10728 108.14615     -1
ENSMUSG00000000028 ENSMUSG00000000028  Cdc45  16  18.78045  18.81199     -1
ENSMUSG00000000037 ENSMUSG00000000037  Scml2   X 161.11719 161.25821      1
ENSMUSG00000000049 ENSMUSG00000000049   Apoh  11 108.34335 108.41440      1
ENSMUSG00000000056 ENSMUSG00000000056   Narf  11 121.23725 121.25586      1
ENSMUSG00000000058 ENSMUSG00000000058   Cav2   6  17.28119  17.28911      1
                      middle nearest.marker.id        biotype      module
ENSMUSG00000000001 108.12671       3_108090236 protein_coding   darkgreen
ENSMUSG00000000028  18.79622       16_18817262 protein_coding        grey
ENSMUSG00000000037 161.18770       X_161182677 protein_coding        grey
ENSMUSG00000000049 108.37887      11_108369225 protein_coding greenyellow
ENSMUSG00000000056 121.24655      11_121200487 protein_coding       brown
ENSMUSG00000000058  17.28515        6_17288298 protein_coding       brown
                   hotspot
ENSMUSG00000000001    <NA>
ENSMUSG00000000028    <NA>
ENSMUSG00000000037    <NA>
ENSMUSG00000000049    <NA>
ENSMUSG00000000056    <NA>
ENSMUSG00000000058    <NA>

# look at raw counts
dataset.islet.rnaseq$raw[1:6,1:6]

      ENSMUSG00000000001 ENSMUSG00000000028 ENSMUSG00000000037
DO021              10247                108                 29
DO022              11838                187                 35
DO023              12591                160                 19
DO024              12424                216                 30
DO025              10906                 76                 21
DO026              12248                110                 34
      ENSMUSG00000000049 ENSMUSG00000000056 ENSMUSG00000000058
DO021                 15                120                703
DO022                 18                136                747
DO023                 18                275               1081
DO024                 81                160                761
DO025                  7                163                770
DO026                 18                204                644

# look at sample metadata
# summarize mouse sex, birth dates and DO waves
table(dataset.islet.rnaseq$samples[, c("sex", "birthdate")])

   birthdate
sex 2014-05-29 2014-10-15 2015-02-25 2015-09-22
  F         46         46         49         47
  M         45         48         50         47

table(dataset.islet.rnaseq$samples[, c("sex", "DOwave")])

   DOwave
sex  1  2  3  4  5
  F 46 46 49 47  0
  M 45 48 50 47  0

In order to make reasonable gene comparisons between samples, the count data need to be normalized. In the quantile-quantile (Q-Q) plot below, count data for the first gene are plotted over a diagonal line tracing a normal distribution for those counts. Notice that most of the count data values lie off of this line, indicating that these gene counts are not normally distributed.

plot of chunk view_manual_qqplot_raw

The same plot can be drawn as shown below. The diagonal line represents a standard normal distribution with mean 0 and standard deviation 1. Count data values are plotted against this standard normal distribution.

qqnorm(dataset.islet.rnaseq$raw[,1], 
     xlab="Theoretical normal percentiles", 
     ylab="Count percentiles",
     main="Count distribution for gene ENSMUSG00000000001")
qqline(dataset.islet.rnaseq$raw[,1]) 

plot of chunk view_qqplot_raw

Q-Q plots for the first six genes show that count data for these genes are not normally distributed. They are also not on the same scale. The y-axis values for each subplot range to 20,000 counts in the first subplot, 250 in the second, 80 in the third, and so on.

par(mfrow=c(2,3))
for (i in 1:6) {
  qqnorm(dataset.islet.rnaseq$raw[,i])
  qqline(dataset.islet.rnaseq$raw[,i])
  }

plot of chunk view_qqplots_raw

Q-Q plots of the normalized expression data for the first six genes show that the data values match the diagonal line well, meaning that they are now normally distributed. They are also all on the same scale now as well.

par(mfrow=c(2,3))
for (i in 1:6) {
  qqnorm(dataset.islet.rnaseq$expr[,i])
  qqline(dataset.islet.rnaseq$expr[,i])
  }

plot of chunk view_qqplots_normalized

Boxplots of raw counts for 5 example genes are shown at left below. Notice that the median count values (horizontal black bar in each boxplot) are not comparable between the genes because the counts are not on the same scale. At right, boxplots for the same 5 genes show normalized count data on the same scale.

par(las=2, mfrow=c(1,2))
boxplot(dataset.islet.rnaseq$raw[,c(5:9)], 
        main="Raw count distributions for 5 example genes")
boxplot(dataset.islet.rnaseq$expr[,c(5:9)], 
        main="Normalized count distributions for the same 5 example genes")

plot of chunk view_example_boxplots

# look at normalized counts
dataset.islet.rnaseq$expr[1:6,1:6]

      ENSMUSG00000000001 ENSMUSG00000000028 ENSMUSG00000000037
DO021        -0.09858728         0.19145703          0.5364855
DO022         0.97443030         2.41565800          1.1599398
DO023         0.75497867         1.62242658         -0.4767954
DO024         1.21299720         2.79216889          0.7205211
DO025         1.62242658        -0.27918258          0.3272808
DO026         0.48416010         0.03281516          0.8453674
      ENSMUSG00000000049 ENSMUSG00000000056 ENSMUSG00000000058
DO021         0.67037652         -1.4903750          0.6703765
DO022         1.03980755         -1.1221937          0.9744303
DO023         0.89324170          0.2317220          1.8245906
DO024         2.15179362         -0.7903569          0.9957498
DO025        -0.03281516         -0.2587715          1.5530179
DO026         0.86427916         -0.3973634          0.2791826

The expression data loaded provides LOD peaks for the eQTL analyses performed in this study. As a preview of what you will be doing next, extract the LOD peaks for chromosome 11.

# look at LOD peaks
dataset.islet.rnaseq$lod.peaks[1:6,]

            annot.id   marker.id chrom        pos       lod
ENSMUSG00000037922 3_136728699     3 136.728699 11.976856
ENSMUSG00000037926 2_164758936     2 164.758936  7.091543
ENSMUSG00000037926 5_147178504     5 147.178504  6.248598
ENSMUSG00000037933 5_147253583     5 147.253583  8.581871
ENSMUSG00000037933  13_6542783    13   6.542783  6.065497
ENSMUSG00000037935 11_99340415    11  99.340415  8.089051

# look at chromosome 11 LOD peaks
chr11_peaks <- dataset.islet.rnaseq$annots %>% 
   select(gene_id, chr) %>% 
   filter(chr=="11") %>%
   left_join(dataset.islet.rnaseq$lod.peaks, 
             by = c("chr" = "chrom", "gene_id" = "annot.id")) 

# look at the first several rows of chromosome 11 peaks
head(chr11_peaks)

             gene_id chr    marker.id       pos       lod
ENSMUSG00000000049  11 11_109408556 109.40856  8.337996
ENSMUSG00000000056  11 11_121434933 121.43493 11.379449
ENSMUSG00000000093  11  11_74732783  74.73278  9.889119
ENSMUSG00000000120  11  11_95962181  95.96218 11.257591
ENSMUSG00000000125  11         <NA>        NA        NA
ENSMUSG00000000126  11  11_26992236  26.99224  6.013533

# how many rows?
dim(chr11_peaks)

[1] 1810    5

# how many rows have LOD scores?
chr11_peaks %>% filter(!is.na(lod)) %>% dim()

[1] 1208    5

# sort chromosome 11 peaks by LOD score
chr11_peaks %>% arrange(desc(lod)) %>% head()

             gene_id chr   marker.id      pos      lod
ENSMUSG00000020268  11 11_55073940 55.07394 189.2332
ENSMUSG00000020333  11 11_53961279 53.96128 147.8134
ENSMUSG00000017404  11 11_98062374 98.06237 136.0430
ENSMUSG00000093483  11 11_83104215 83.10421 135.1067
ENSMUSG00000058546  11 11_78204410 78.20441 132.3365
ENSMUSG00000078695  11 11_97596132 97.59613 131.8005

# range of LOD scores and positions
range(chr11_peaks$lod, na.rm = TRUE)

[1]   6.000975 189.233211

range(chr11_peaks$pos, na.rm = TRUE)

[1]   3.126009 122.078650

# view LOD scores by position
chr11_peaks %>% arrange(desc(lod)) %>% 
  ggplot(aes(pos, lod)) + geom_point()

Warning: Removed 602 rows containing missing values (geom_point).

plot of chunk lod_peaks

Key Points

previous episode

Gene Expression QTL Analysis

next episode