1 Preparation

We load some new packages.

library(edgeR)          # cpm, etc -- RNA-Seq normalization
library(limma)          # lmFit, etc -- fitting many models
library(ComplexHeatmap) # Heatmap
library(seriation)      # seriate -- nice clustering and ordering
library(reshape2)       # melt -- matrix to long data frame

library(multcomp)       # glht -- linear hypotheses
library(tidyverse)      # working with data frames, plotting

Set up the teeth dataset as before, if you haven’t already.

teeth <- read_csv("r-linear-files/teeth.csv")
## Rows: 16 Columns: 8
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sample, mouse, tooth
## dbl (5): day, gene_ace, gene_smoc1, gene_pou3f3, gene_wnt2
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
teeth$tooth <- factor(teeth$tooth, levels=c("lower","upper"))
teeth$mouse <- factor(teeth$mouse)

# A convenience to examine different model fits
more_data <- expand.grid(
    day=seq(14.3,18.2,by=0.01),
    tooth=factor(c("lower","upper"), levels=c("lower","upper")))

look <- function(y, fit=NULL) {
    p <- ggplot(teeth,aes(x=day,group=tooth))
    if (!is.null(fit)) {
        more_ci <- cbind(
            more_data, 
            predict(fit, more_data, interval="confidence"))
        p <- p + 
            geom_ribbon(aes(ymin=lwr,ymax=upr), data=more_ci, alpha=0.1) + 
            geom_line(aes(y=fit,color=tooth), data=more_ci)
    }
    p + geom_point(aes(y=y,color=tooth)) +
        labs(y=deparse(substitute(y)))
}
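
For example (a quick sanity check of this setup), we can fit a simple model to one of the genes included in the teeth data frame and plot it with look():

# Quick check of the setup: fit a simple model to one gene and plot it
gene_ace_fit <- lm(gene_ace ~ tooth * day, data=teeth)
look(teeth$gene_ace, gene_ace_fit)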

2 Testing many genes with limma

In this section we look at fitting the same matrix of predictors X to many different sets of responses y. We will use the package limma from Bioconductor.

2.1 Load the data

Actually, in the teeth dataset the expression levels of all genes were measured!

counts_df <- read_csv("r-linear-files/teeth-read-counts.csv")
counts <- as.matrix( select(counts_df, -gene) )
rownames(counts) <- counts_df$gene

dim(counts)
## [1] 32544    16
counts[1:5,]
##               ind1lower ind1upper ind2lower ind2upper ind3lower ind3upper
## 0610007P14Rik      1721      1846      1708      1877      1443      1534
## 0610009B22Rik       512       466       554       504       501       498
## 0610009L18Rik        68        71       121       129        74       124
## 0610009O20Rik      2583      2797      2840      2975      2675      2690
## 0610010F05Rik      1079      1246      1262       992      1232       992
##               ind4lower ind4upper ind5lower ind5upper ind6lower ind6upper
## 0610007P14Rik      1389      1439      1699      1881      1697      1716
## 0610009B22Rik       478       427       569       614       577       568
## 0610009L18Rik        83        87        86       125        99       135
## 0610009O20Rik      2429      2313      2693      3093      2854      3066
## 0610010F05Rik      1011       977      1175      1384      1235      1221
##               ind7lower ind7upper ind8lower ind8upper
## 0610007P14Rik      1531      1625      1716      1373
## 0610009B22Rik       587       531       611       453
## 0610009L18Rik       130       132       114        92
## 0610009O20Rik      2487      2904      2713      2455
## 0610010F05Rik      1042       998      1079       882

2.2 Model matrix

The column names match our teeth data frame.

colnames(counts)
##  [1] "ind1lower" "ind1upper" "ind2lower" "ind2upper" "ind3lower" "ind3upper"
##  [7] "ind4lower" "ind4upper" "ind5lower" "ind5upper" "ind6lower" "ind6upper"
## [13] "ind7lower" "ind7upper" "ind8lower" "ind8upper"
teeth$sample
##  [1] "ind1lower" "ind1upper" "ind2lower" "ind2upper" "ind3lower" "ind3upper"
##  [7] "ind4lower" "ind4upper" "ind5lower" "ind5upper" "ind6lower" "ind6upper"
## [13] "ind7lower" "ind7upper" "ind8lower" "ind8upper"

We will use limma to fit a linear model to each gene. The same model formula will be used in each case. limma doesn’t automatically convert a formula into a model matrix, so we have to do this step manually. Here I am using a model formula that treats the upper and lower teeth as following a different linear trend over time.

I’m going to rescale time so that it runs from zero (day 14.5) to one (day 18), to aid interpretation.

teeth$time <- (teeth$day - 14.5) / (18-14.5)
teeth
## # A tibble: 16 × 9
##    sample    mouse   day tooth gene_ace gene_smoc1 gene_pou3f3 gene_wnt2  time
##    <chr>     <fct> <dbl> <fct>    <dbl>      <dbl>       <dbl>     <dbl> <dbl>
##  1 ind1lower ind1   14.5 lower     2.69       16.4       15.0     2.85   0    
##  2 ind1upper ind1   14.5 upper     1.79       22.4      186.      8.90   0    
##  3 ind2lower ind2   15   lower     3.16       30.5       12.6     1.70   0.143
##  4 ind2upper ind2   15   upper     3.46       45.4      171.      5.11   0.143
##  5 ind3lower ind3   15.5 lower     4.26       54.7        9.58    0.565  0.286
##  6 ind3upper ind3   15.5 upper     3.38       81.0      158.      4.07   0.286
##  7 ind4lower ind4   16   lower     6.40       77.4       14.7     0.218  0.429
##  8 ind4upper ind4   16   upper     4.98       78.9      154.      2.71   0.429
##  9 ind5lower ind5   16.5 lower     8.37       75.1        7.13    0.197  0.571
## 10 ind5upper ind5   16.5 upper     6.77       91.7      124.      1.34   0.571
## 11 ind6lower ind6   17   lower    13.3        53.5        7.33    0.0738 0.714
## 12 ind6upper ind6   17   upper    11.1        87.9       99.3     1.07   0.714
## 13 ind7lower ind7   17.5 lower    18.2        38.4        7.64    0.105  0.857
## 14 ind7upper ind7   17.5 upper    15.4        62.2       88.3     1.04   0.857
## 15 ind8lower ind8   18   lower    26.9        24.8        6.55    0.0514 1    
## 16 ind8upper ind8   18   upper    22.0        42.9       86.0     0.992  1
X <- model.matrix(~ tooth * time, data=teeth)
X
##    (Intercept) toothupper      time toothupper:time
## 1            1          0 0.0000000       0.0000000
## 2            1          1 0.0000000       0.0000000
## 3            1          0 0.1428571       0.0000000
## 4            1          1 0.1428571       0.1428571
## 5            1          0 0.2857143       0.0000000
## 6            1          1 0.2857143       0.2857143
## 7            1          0 0.4285714       0.0000000
## 8            1          1 0.4285714       0.4285714
## 9            1          0 0.5714286       0.0000000
## 10           1          1 0.5714286       0.5714286
## 11           1          0 0.7142857       0.0000000
## 12           1          1 0.7142857       0.7142857
## 13           1          0 0.8571429       0.0000000
## 14           1          1 0.8571429       0.8571429
## 15           1          0 1.0000000       0.0000000
## 16           1          1 1.0000000       1.0000000
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$tooth
## [1] "contr.treatment"

limma refers to this matrix as the “design”.

2.3 Remove low expression genes

There is little chance of detecting differential expression in genes with very low read counts. Including these genes requires a larger False Discovery Rate correction, and also confuses limma’s Empirical Bayes parameter estimation. edgeR provides a function, filterByExpr, that performs its authors’ recommended level of filtering. The filtering takes into account the complexity of the model being fitted, so the “design” matrix is also needed.

keep <- filterByExpr(counts, design=X)
table(keep)
## keep
## FALSE  TRUE 
## 15333 17211
counts_kept <- counts[keep, ]
dim(counts_kept)
## [1] 17211    16

2.4 Normalize, log transform

Different samples may have produced different numbers of reads. We now normalize this away by converting the read counts to counts per million (CPM), and log2 transforming the results. There are some subtleties here: “TMM” normalization is used as a small adjustment to the total number of reads in each sample. A small constant “prior count” is added to the counts to avoid calculating log2(0). The edgeR and limma manuals describe these steps in more detail.

dgelist <- calcNormFactors(DGEList(counts_kept))
dgelist$samples
##           group lib.size norm.factors
## ind1lower     1 37892888    0.9951385
## ind1upper     1 39776241    1.0006489
## ind2lower     1 43124853    0.9938861
## ind2upper     1 40966589    0.9996592
## ind3lower     1 41125302    0.9968494
## ind3upper     1 39605386    1.0118981
## ind4lower     1 37753999    0.9993671
## ind4upper     1 36824277    1.0073017
## ind5lower     1 41660207    1.0034294
## ind5upper     1 49706630    1.0005100
## ind6lower     1 44193014    1.0041810
## ind6upper     1 45913656    1.0068377
## ind7lower     1 39910979    1.0010187
## ind7upper     1 42871935    1.0073428
## ind8lower     1 44654050    0.9822320
## ind8upper     1 40770709    0.9901066
log2_cpms <- cpm(dgelist, log=TRUE, prior.count=2)

log2_cpms[1:5,]
##               ind1lower ind1upper ind2lower ind2upper ind3lower ind3upper
## 0610007P14Rik 5.5137232 5.5369080  5.318229  5.519839 5.1394195  5.260202
## 0610009B22Rik 3.7682661 3.5553172  3.697489  3.627011 3.6169198  3.640875
## 0610009L18Rik 0.8885341 0.8732802  1.521584  1.677159 0.8899107  1.651715
## 0610009O20Rik 6.0990178 6.1358785  6.051120  6.183743 6.0289821  6.069733
## 0610010F05Rik 4.8410724 4.9705275  4.882246  4.601168 4.9116869  4.632302
##               ind4lower ind4upper ind5lower ind5upper ind6lower ind6upper
## 0610007P14Rik  5.204066  5.279553  5.346635  5.243012  5.258815  5.216015
## 0610009B22Rik  3.668678  3.531014  3.771826  3.631590  3.705977  3.624689
## 0610009L18Rik  1.168533  1.259087  1.073999  1.356971  1.188301  1.569651
## 0610009O20Rik  6.009578  5.963578  6.010536  5.959802  6.008080  6.052503
## 0610010F05Rik  4.746505  4.721760  4.815369  4.801012  4.801015  4.725784
##               ind7lower ind7upper ind8lower ind8upper
## 0610007P14Rik  5.261881  5.235547  5.291748  5.090033
## 0610009B22Rik  3.881739  3.625672  3.805136  3.494407
## 0610009L18Rik  1.723309  1.634356  1.404433  1.218512
## 0610009O20Rik  5.961120  6.072339  5.951937  5.927528
## 0610010F05Rik  4.707609  4.533373  4.623436  4.452683
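
Roughly speaking, each log2 CPM value is the count plus the prior count, divided by the effective (normalization-adjusted) library size, scaled to per-million and log2 transformed. Below is a sketch of that calculation; the exact edgeR formula scales the prior count by library size, so the values will only match approximately.

# Rough sketch of the cpm(log=TRUE) calculation. The exact edgeR formula
# scales the prior count by library size, so this only matches approximately.
eff_lib <- dgelist$samples$lib.size * dgelist$samples$norm.factors
approx_log2_cpm <- log2( sweep(counts_kept + 2, 2, eff_lib, "/") * 1e6 )
approx_log2_cpm[1:5, 1:4]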

2.5 Fitting a model to and testing each gene

We are now ready to fit a linear model to each gene.

fit <- lmFit(log2_cpms, X)

class(fit)
## [1] "MArrayLM"
## attr(,"package")
## [1] "limma"
fit$coefficients[1:5,]
##               (Intercept)  toothupper       time toothupper:time
## 0610007P14Rik    5.347432  0.14681873 -0.1112342     -0.28198951
## 0610009B22Rik    3.674935 -0.06773608  0.1291380     -0.16089160
## 0610009L18Rik    1.006469  0.31302640  0.4517124     -0.28052108
## 0610009O20Rik    6.079268  0.05265949 -0.1284426     -0.04413545
## 0610010F05Rik    4.901942 -0.07194132 -0.2216490     -0.07869981

Significance testing in limma is done using linear hypotheses (which limma refers to as “contrasts”). A difference between glht and limma’s contrasts.fit is that limma expects the linear combinations to be in columns rather than rows.

We will first look for genes where the slope over time is not flat, averaging the lower and upper teeth.

# Lower slope: c(0,0,1,0)
# Upper slope: c(0,0,1,1)

K <- rbind( average_slope=c(0,0,1,0.5) )
cfit <- contrasts.fit(fit, t(K))         #linear combinations in columns!
efit <- eBayes(cfit, trend=TRUE)

The call to eBayes does Empirical Bayes squeezing of the residual variance for each gene (see Appendix). This is a bit of magic that allows limma to work well with small numbers of samples. We’re now ready to look at the results.

topTable(efit)
##             logFC  AveExpr         t      P.Value    adj.P.Val        B
## Rnf144b  2.344048 4.485626  33.54415 8.243141e-16 1.418727e-11 25.97583
## Tfap2b  -2.245292 7.468960 -28.79399 8.345315e-15 3.337650e-11 23.96954
## Six2    -1.951400 6.978982 -28.63530 9.072794e-15 3.337650e-11 23.89510
## Vldlr    1.451150 6.814847  28.55403 9.471182e-15 3.337650e-11 23.85677
## Prom2    3.201192 3.680446  28.15059 1.174412e-14 3.337650e-11 23.66441
## Stk32a   4.279093 3.713109  27.97477 1.291029e-14 3.337650e-11 23.57948
## Ace      3.436724 2.819766  27.88199 1.357478e-14 3.337650e-11 23.53439
## Bambi    1.963769 6.105063  26.76410 2.517643e-14 5.416395e-11 22.97582
## Msx2     2.843247 6.254208  25.70379 4.629942e-14 8.853992e-11 22.41856
## Cox4i2   1.986644 3.812147  24.60053 8.957775e-14 1.519419e-10 21.80820

The column adj.P.Val contains FDR adjusted p-values.

The logFC column contains the estimate of the quantity we are interested in; usually this is a log2 fold change. Here, it’s the log2 fold change from day 14.5 to day 18.
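
Since these are log2 values, they can be converted back to ordinary fold changes where that is easier to communicate:

# Convert the log2 fold changes of the top genes to ordinary fold changes
top <- topTable(efit)
data.frame(gene=rownames(top), logFC=top$logFC, fold_change=2^top$logFC)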

We can also ask for results for all genes:

all_results <- topTable(efit, n=Inf)

significant <- all_results$adj.P.Val <= 0.05
table(significant)
## significant
## FALSE  TRUE 
## 10421  6790
# or

dt <- decideTests(efit)
summary(dt)
##        average_slope
## Down            3249
## NotSig         10421
## Up              3541
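
The adjustment uses the Benjamini-Hochberg method by default, so as a sanity check we can reproduce adj.P.Val with p.adjust():

# topTable adjusts p-values with the Benjamini-Hochberg method by default
all.equal(
    p.adjust(all_results$P.Value, method="BH"),
    all_results$adj.P.Val)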

2.6 Diagnostic plots

2.6.1 MD plot

As always, diagnostic plots are an essential part of the process.

It’s common to plot the logFC column against the average log expression level, an “MD plot” (also sometimes called an “MA plot”).

plotMD(efit, status=dt[,1])

# Or do it ourselves:
ggplot(all_results, aes(x=AveExpr, y=logFC)) + 
    geom_point(size=0.1) +
    geom_point(data=all_results[significant,], size=0.5, color="red")

2.6.2 Volcano plot

Often people will choose genes based on both FDR and log fold change thresholds. The volcano plot provides a way to visualize this. Volcano plots show log fold change on the x-axis and p-values on the y-axis on a log scale. Usually the adjusted p-value (FDR) is shown, but you may sometimes see unadjusted p-values used instead.

chosen_results <- all_results |>
    filter(adj.P.Val <= 0.05, abs(logFC) >= 1)

ggplot(all_results, aes(x=logFC, y=-log10(adj.P.Val))) + 
    geom_point(size=0.1) +
    geom_point(data=chosen_results, size=0.5, color="red")

If p-values simply got smaller according to the magnitude of log fold change, the plot would be a perfect V shape. However, because different genes have different residual standard deviations, the arms of the V are puffier.
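
As a sketch of this, we can color the volcano plot by each gene’s moderated residual standard deviation, using an unsorted results table so that its rows line up with the rows of efit:

# Color points by moderated residual standard deviation. sort.by="none"
# keeps the table in the same gene order as efit, so the rows line up.
unsorted_results <- topTable(efit, n=Inf, sort.by="none")
unsorted_results$residual_sd <- sqrt(efit$s2.post)

ggplot(unsorted_results, aes(x=logFC, y=-log10(adj.P.Val), color=residual_sd)) + 
    geom_point(size=0.1) +
    scale_color_viridis_c()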

2.6.3 MDS plot

A commonly used first look at the quality of the data is provided by limma’s MDS plot. This is very similar to a PCA plot. A difference is that it concentrates by default on the top 500 most different genes when comparing each pair of samples. The percentages shown on the axes refer to variance explained in these comparisons.

With the MDS plot we are looking for variables in the experiment like treatment groups and batch effects to be reflected in the layout. Here, pleasingly, dimension one corresponds to time. Some separation is seen between corresponding upper and lower molars. There seem to be other things going on in the second dimension that may represent unexpected sources of variation in the data.

plotMDS(log2_cpms)
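
To check this interpretation, we can label each point with its day and tooth:

# Label points by day and tooth to confirm dimension 1 tracks time
plotMDS(log2_cpms, labels=paste0(teeth$day, teeth$tooth))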

2.6.4 SA plot

As always, residuals are of interest. Here we will plot the residual standard deviation of each gene on a square root scale, versus the average log expression level. We also see the trend line that was fitted during the Empirical Bayes step.

When we used cpm() earlier to log transform the data, we gave a “prior count” to damp down variation at low expression levels. We can see there is still some mean-variance trend, but limma was able to model it.

plotSA(efit)

2.6.5 Melting the data and using ggplot

For a complex experiment like this, ggplot2 can also be used to look at genes in detail in creative ways. Let’s look at the top genes we discovered earlier. To use ggplot, we need to “melt” the data into a “long” data frame. I use reshape2::melt() here because the data is in a matrix. If it were in a data frame, I would have used tidyr::pivot_longer().

genes_wanted <- rownames(all_results) |> head(20)

melted_df <- 
    melt(log2_cpms, varnames=c("gene","sample"), value.name="log2_cpm") |> 
    filter(gene %in% genes_wanted) |>
    mutate(gene = factor(gene, genes_wanted)) |>
    left_join(teeth, by="sample")

ggplot(melted_df) +
    aes(x=day, y=log2_cpm, color=tooth) +
    facet_wrap(~gene, scale="free_y", ncol=5) +
    geom_line() +
    geom_point()

2.6.6 Heatmap

A heatmap of variable genes can be a great way to reveal features of a dataset. Here I show the top 100 genes selected by range of log2 expression. Expression is shown as z-scores (i.e. scaled to have zero mean, and unit standard deviation). Genes are ordered so as to cluster similar patterns of expression. Heatmaps are a complex topic! There are a lot of alternatives to what I’ve done here in terms of gene selection, scaling, and clustering.

# Select some genes to show
gene_ranges <- apply(log2_cpms,1,max) - apply(log2_cpms,1,min) 
genes_wanted <- gene_ranges |> sort(decreasing=TRUE) |> head(100) |> names()

# z-transformation
x <- log2_cpms[genes_wanted,]
z <- t(scale(t(x)))

# Hierarchical clustering of genes, refined with Optimal Leaf Ordering
ordering <- seriate(dist(z), method="OLO")

# Annotate with the original range of the data
row_anno = rowAnnotation(
    "log2 CPM"=anno_points(x, 
        which="row", border=FALSE, size=unit(1,"mm"), width=unit(20,"mm")))

Heatmap(
    z, 
    name="z score",
    show_row_names=FALSE,
    cluster_rows=as.dendrogram(ordering[[1]]),
    cluster_columns=FALSE,
    right_annotation=row_anno)

When we looked for genes changing over time, that’s what we found. When we let the data speak for itself, we may see many strange things! These may point to important factors to include in our model, or to normalize out of the data.

2.7 Connections to earlier ideas

2.7.1 F test

limma can also perform tests against a null hypothesis in which several coefficients are dropped from the model, allowing tests like those we have performed with anova earlier. Suppose we want to find any deviation from a constant expression level. We can check for this by dropping every coefficient except for the intercept. The eBayes step is still needed.

efit2 <- eBayes(fit, trend=TRUE)
F_test_results <- topTable(efit2, coef=c(2,3,4))
F_test_results
##          toothupper      time toothupper.time   AveExpr        F      P.Value
## Pou3f3   3.75941590 -1.175552     -0.05247009 5.1329668 509.6867 1.297841e-15
## Nkx2-3  -9.11616115 -1.576437      1.10659698 0.7468317 479.4989 2.065496e-15
## Rnf144b -0.49079440  2.225058      0.23798080 4.4856260 398.0548 8.500076e-15
## Six2     0.63094758 -2.128418      0.35403768 6.9789817 384.9094 1.096741e-14
## Tfap2b  -0.02343931 -2.767909      1.04523436 7.4689603 323.2106 4.121277e-14
## Prom2   -0.71336944  3.359247     -0.31610996 3.6804464 310.4695 5.586769e-14
## Vldlr    0.44278542  1.662601     -0.42290134 6.8148472 293.6635 8.508209e-14
## Stk32a  -0.25083812  4.412483     -0.26677943 3.7131094 266.0235 1.794141e-13
## Ace     -0.30614649  3.412649      0.04814970 2.8197656 263.2210 1.943280e-13
## Bambi   -0.07834354  2.159273     -0.39100658 6.1050626 251.9731 2.700563e-13
##            adj.P.Val
## Pou3f3  1.777462e-11
## Nkx2-3  1.777462e-11
## Rnf144b 4.719004e-11
## Six2    4.719004e-11
## Tfap2b  1.418626e-10
## Prom2   1.602565e-10
## Vldlr   2.091925e-10
## Stk32a  3.716199e-10
## Ace     3.716199e-10
## Bambi   4.647939e-10

2.7.2 Challenge - visualize different sets of genes

Try visualizing different sets of genes, either using the ggplot2 facet method or the Heatmap method we saw earlier.

Some possible sets of genes are:

  • The top genes from the F test we just did, or another test of your own devising.
  • The largest log fold changes in the all_results table, ignoring p-values.
  • A random sample of genes, using sample() (see the sketch after this list).
  • A heatmap with many more genes shown.
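
For example, a minimal sketch for the random-sample option is below; the melting-and-plotting code or the heatmap code from above can then be re-run with the new genes_wanted.

# A random sample of 20 genes, to re-use with the facet plot or heatmap code
set.seed(1)
genes_wanted <- sample(rownames(log2_cpms), 20)
head(genes_wanted)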

2.7.3 Relation to lm( ) and glht( )

Let’s look at a specific gene.

rnf144b <- log2_cpms["Rnf144b",]
rnf144b_fit <- lm(rnf144b ~ tooth * time, data=teeth)

# This is needed to make look() work with the rescaled time
more_data$time <- (more_data$day - 14.5) / (18-14.5)

look(rnf144b, rnf144b_fit)

We can use the same linear hypothesis with glht. The estimate is the same as reported by topTable, but limma gained some power by shrinking the variance toward the trend line, so limma’s p-value is smaller.

summary( glht(rnf144b_fit, K) )
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = rnf144b ~ tooth * time, data = teeth)
## 
## Linear Hypotheses:
##                    Estimate Std. Error t value Pr(>|t|)    
## average_slope == 0  2.34405    0.06559   35.73 1.47e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)

2.7.4 Confidence intervals

Confidence intervals may also be of interest. However, note that these are not adjusted for multiple testing (see Appendix).

topTable(efit, confint=0.95)
##             logFC      CI.L      CI.R  AveExpr         t      P.Value
## Rnf144b  2.344048  2.195431  2.492666 4.485626  33.54415 8.243141e-16
## Tfap2b  -2.245292 -2.411133 -2.079451 7.468960 -28.79399 8.345315e-15
## Six2    -1.951400 -2.096332 -1.806467 6.978982 -28.63530 9.072794e-15
## Vldlr    1.451150  1.343065  1.559236 6.814847  28.55403 9.471182e-15
## Prom2    3.201192  2.959342  3.443041 3.680446  28.15059 1.174412e-14
## Stk32a   4.279093  3.953777  4.604410 3.713109  27.97477 1.291029e-14
## Ace      3.436724  3.174579  3.698869 2.819766  27.88199 1.357478e-14
## Bambi    1.963769  1.807721  2.119818 6.105063  26.76410 2.517643e-14
## Msx2     2.843247  2.607992  3.078502 6.254208  25.70379 4.629942e-14
## Cox4i2   1.986644  1.814894  2.158394 3.812147  24.60053 8.957775e-14
##            adj.P.Val        B
## Rnf144b 1.418727e-11 25.97583
## Tfap2b  3.337650e-11 23.96954
## Six2    3.337650e-11 23.89510
## Vldlr   3.337650e-11 23.85677
## Prom2   3.337650e-11 23.66441
## Stk32a  3.337650e-11 23.57948
## Ace     3.337650e-11 23.53439
## Bambi   5.416395e-11 22.97582
## Msx2    8.853992e-11 22.41856
## Cox4i2  1.519419e-10 21.80820

2.8 Differential expression methods in general usage

2.8.1 limma-voom

Above, we did a fairly straightforward log transformation of the count data, followed by linear modelling. Personally I think this is adequate for most cases, but some further refinements have been developed.

A step up from this, and probably the most popular method, is “voom”. This again works on log transformed data, but uses a precision weight for each individual count, derived from an initial fit of the data, to account for the larger amount of noise associated with lower expression levels on a log scale. With the eBayes(trend=TRUE) method we used above, we accounted for this only at the gene level. With “voom”, the adjustment is applied at the individual count level, based on the predicted expression level from the initial fit and also the library size for each sample, so it is more fine-grained.

voomed <- voom(dgelist, design=X, plot=TRUE)

voomed_fit <- lmFit(voomed, design=X)
voomed_cfit <- contrasts.fit(voomed_fit, t(K))
voomed_efit <- eBayes(voomed_cfit)
topTable(voomed_efit)
##             logFC  AveExpr         t      P.Value    adj.P.Val        B
## Rnf144b  2.370894 4.482956  35.07500 4.752489e-16 3.728250e-12 26.72120
## Prom2    3.347961 3.675104  35.23199 4.441806e-16 3.728250e-12 26.39172
## Ace      3.560359 2.809987  34.35721 6.498605e-16 3.728250e-12 25.15308
## Tfap2b  -2.242440 7.468614 -27.94478 1.466804e-14 4.582943e-11 23.78511
## Vldlr    1.451944 6.814360  27.62693 1.742059e-14 4.582943e-11 23.61460
## Six2    -1.955022 6.978507 -27.59096 1.776503e-14 4.582943e-11 23.59363
## Bambi    1.982702 6.104233  27.50288 1.863959e-14 4.582943e-11 23.53954
## Cox4i2   1.954738 3.808062  25.78108 4.918868e-14 1.058233e-10 22.26078
## Stk32a   4.007612 3.706738  25.56528 5.579255e-14 1.066940e-10 22.10362
## Ccnd1    1.839825 7.758813  23.93883 1.491823e-13 2.567576e-10 21.46602

The results with this data are quite similar. Voom may have an advantage where samples have different library sizes. It also has an extension to account for samples of varying quality by estimating “sample weights”.
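
The sample weights extension is provided by the voomWithQualityWeights function. A sketch of its use (we won’t examine the output further here):

# Sketch: voom with sample quality weights (output not explored further here)
voomed_qw <- voomWithQualityWeights(dgelist, design=X, plot=TRUE)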

2.8.2 edgeR

A further step is to use a negative-binomial GLM. Popular packages are DESeq2 and edgeR; here we’ll demonstrate edgeR. edgeR does a number of sophisticated things beyond using a GLM, and the authors always seem to be working to make it better. We use the edgeR “quasi-likelihood” method, their latest iteration.

dgelist_with_disp <- estimateDisp(dgelist, X)
edger_fit <- glmQLFit(dgelist_with_disp, X)
edger_results <- glmQLFTest(edger_fit, contrast=t(K))
topTags(edger_results)
## Coefficient:  1*time 0.5*toothupper:time 
##              logFC   logCPM         F       PValue          FDR
## Rnf144b   2.354059 4.698677 1154.4667 5.045020e-16 8.682984e-12
## Ace       3.502027 3.259982  966.3023 1.979132e-15 1.173677e-11
## Prom2     3.258979 4.139165  962.0111 2.045802e-15 1.173677e-11
## Six2     -1.951893 7.170847  809.1566 7.687806e-15 3.077752e-11
## Tfap2b   -2.248615 7.667526  789.9533 9.236532e-15 3.077752e-11
## Vldlr     1.450302 6.897344  772.9341 1.090831e-14 3.077752e-11
## Stk32a    4.179573 4.318189  759.1936 1.251773e-14 3.077752e-11
## Bambi     1.962854 6.261558  723.9216 1.798412e-14 3.869058e-11
## Ptprt     2.812593 3.399624  652.5455 3.969246e-14 7.590521e-11
## Hbb-y   -10.200341 4.236711  575.2606 5.523972e-14 9.031048e-11

This is a more principled approach than the log-counts based methods. I would note, however, that GLMs fit the expected expression levels on a linear scale, and so can be overly influenced by large individual counts. We sometimes see “significant” results based on large counts in just one or two samples.

All of these methods produce similar numbers of significant genes with this data.

decideTests(efit) |> summary()
##        average_slope
## Down            3249
## NotSig         10421
## Up              3541
decideTests(voomed_efit) |> summary()
##        average_slope
## Down            3185
## NotSig         10555
## Up              3471
decideTests(edger_results) |> summary()
##        1*time 0.5*toothupper:time
## Down                         3242
## NotSig                      10446
## Up                           3523

2.8.3 Typical design matrix setup

Traditionally the design matrix for differential expression does not include an intercept, using a formula like ~ 0 + group. This makes it easier to write out contrasts, at least for simple experimental designs.

As we discussed earlier, this doesn’t make a difference to the results. Well, it shouldn’t make a difference to the results. Because of a calculation shortcut the limma-voom method takes when using weights, there can be very slight differences with this method: results are perfectly accurate if ~ 0 + group is used, and very slightly inaccurate if ~ group is used. For more complex designs, such as designs that account for a batch effect, contrastAsCoef() could potentially be used to get perfectly accurate results with the limma-voom method. See the note in the documentation for contrasts.fit. I have never seen anyone bother to do this in practice though!

For example, our PVC dataset from earlier would traditionally be set up like this.

pvc <- read_csv("r-linear-files/pvc.csv")

pvc_design <- model.matrix(~ 0 + operator + resin, data=pvc)

pvc_contrasts <- makeContrasts(
    Bob_vs_Alice = operatorBob - operatorAlice,
    Carl_vs_Alice = operatorCarl - operatorAlice,
    Carl_vs_Bob = operatorCarl - operatorBob,
    levels=pvc_design)

pvc_contrasts
##                Contrasts
## Levels          Bob_vs_Alice Carl_vs_Alice Carl_vs_Bob
##   operatorAlice           -1            -1           0
##   operatorBob              1             0          -1
##   operatorCarl             0             1           1
##   resinR2                  0             0           0
##   resinR3                  0             0           0
##   resinR4                  0             0           0
##   resinR5                  0             0           0
##   resinR6                  0             0           0
##   resinR7                  0             0           0
##   resinR8                  0             0           0

It is a very common pattern in RNA-Seq experiments to have a set of treatment groups and a set of batches, with all of the treatments performed once in each batch. Including the batch in the model then often increases the statistical power to detect differences between groups.
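
For example, with a hypothetical sample sheet (the data frame and its column names here are made up for illustration), the design and a contrast might be set up like this:

# Hypothetical example: two treatment groups, each run once in three batches
samples <- data.frame(
    group = factor(rep(c("control","treatment"), times=3)),
    batch = factor(rep(c("b1","b2","b3"), each=2)))

batch_design <- model.matrix(~ 0 + group + batch, data=samples)

makeContrasts(
    treatment_vs_control = grouptreatment - groupcontrol,
    levels=batch_design)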

2.9 Further exercises

How would you find genes that:

  1. Differ in slope between lower and upper molars? (A starting sketch is given after this list.)

  2. Differ in expression at time 0.5 between the lower and upper molars? (Hint: this can be viewed as the difference in predictions between two individual samples.)

  3. Have non-zero slope in either or both the lower and upper molars?

  4. Have a non-linear pattern of expression, such as quadratic?
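
As a starting point for question 1 (a sketch following the same pattern as the average-slope test above), the difference in slopes is just the interaction coefficient:

# Difference in slopes between upper and lower is the toothupper:time coefficient
K_slope_diff <- rbind( slope_difference=c(0,0,0,1) )
efit_slope_diff <- eBayes(contrasts.fit(fit, t(K_slope_diff)), trend=TRUE)
topTable(efit_slope_diff)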

3 Appendix: more on limma

3.1 Empirical Bayes variance squeezing

In limma, Empirical Bayes squeezing of the residual variance acts as though we have some number of extra “prior” observations of the variance. These are also counted as extra degrees of freedom in F tests. The “prior” observations act to squeeze the estimated residual variance toward a trend line that is a function of the average expression level.

efit <- eBayes(cfit, trend=TRUE)

efit$df.prior
## [1] 3.388566
efit$df.residual
##  [1] 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
##  [ reached getOption("max.print") -- omitted 17190 entries ]
efit$df.total
##  [1] 15.38857 15.38857 15.38857 15.38857 15.38857 15.38857 15.38857 15.38857
##  [9] 15.38857 15.38857 15.38857 15.38857 15.38857 15.38857 15.38857 15.38857
## [17] 15.38857 15.38857 15.38857 15.38857 15.38857
##  [ reached getOption("max.print") -- omitted 17190 entries ]
plotSA(efit)
points(efit$Amean, efit$s2.post^0.25, col="red", cex=0.2)

The total effective degrees of freedom is the “prior” degrees of freedom plus the normal residual degrees of freedom. As can be seen in the plot, compared to the residual variance (black dots), this produces a posterior residual variance (efit$s2.post, red dots) that is squeezed toward the trend line.
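
We can check this directly:

# df.total is the residual df plus the prior df (for this dataset)
head( efit$df.residual + efit$df.prior )
head( efit$df.total )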

It’s worthwhile checking df.prior when using limma, as a low value may indicate a problem with a dataset.

3.2 Top confident effect sizes

A little self-promotion: With limma we are able to find genes where our effect of interest is significantly different from zero. However we may make many thousands of discoveries, too many to easily follow up, and some of the effects discovered may be tiny. I have developed a method to rank genes by confident effect size, with multiple testing correction, implemented in the package topconfects. This method builds on the treat method provided by limma.

This is performed with the limma_confects function. limma_confects incorporates the Empirical Bayes variance squeezing step, so remember to specify trend=TRUE if using trend based variance squeezing.

library(topconfects)

result <- limma_confects(cfit, "average_slope", trend=TRUE, fdr=0.05)
result
## $table
##    confect effect AveExpr name   
## 1  -4.796  -9.122  1.6070 Hbb-y  
## 2   4.750   7.011  4.1466 Mmp13  
## 3   4.438   6.241  2.4948 Ecel1  
## 4   4.173   7.171  1.2460 Tbx22  
## 5   4.173   6.932 -1.4334 Scgn   
## 6  -4.066  -5.828  0.7321 Hba-x  
## 7   4.066   7.012  3.5122 Igfbpl1
## 8   3.946   5.798 -1.2908 Sdr16c6
## 9   3.760   5.965  5.7260 Vwde   
## 10  3.760   5.901  3.2184 Adgrf4 
## ...
## 6790 of 17211 non-zero log2 fold change at FDR 0.05
## Prior df 3.4

Results are ranked by “confect”, confident effect size. If you select genes with abs(confect) >= some value, those genes will have a magnitude of effect greater than that threshold, with controlled FDR. (Exactly the same set of genes is found using treat with this threshold.)
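
For example, selecting genes with abs(confect) >= 1 should match limma’s treat method with a threshold of 1 (a sketch, using the same trend-based variance squeezing):

# Number of genes confidently changing by more than 1 (log2) over time
sum( abs(result$table$confect) >= 1, na.rm=TRUE )

# This should match the number of significant genes from limma's treat method
tfit <- treat(cfit, lfc=1, trend=TRUE)
summary( decideTests(tfit) )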

Compared to the “most significant” gene, the top gene by topconfects has a larger slope (but lower overall average expression).

hbby <- log2_cpms["Hbb-y",]
hbby_fit <- lm(hbby ~ tooth * day, data=teeth)
look(hbby, hbby_fit)

# Highlight genes with logFC confidently larger than 1.
ggplot(result$table, aes(x=AveExpr, y=effect)) +
    geom_point(size=0.1) +
    geom_point(
        data=filter(result$table, abs(confect) >= 1), 
        size=0.5, color="red")

See also “False Coverage-statement Rate” for a generally applicable approach to multiple-testing corrected confidence intervals.


sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sonoma 14.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## time zone: Australia/Melbourne
## tzcode source: internal
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] topconfects_1.22.0    lubridate_1.9.4       forcats_1.0.0        
##  [4] stringr_1.5.1         dplyr_1.1.4           purrr_1.0.4          
##  [7] readr_2.1.5           tidyr_1.3.1           tibble_3.2.1         
## [10] ggplot2_3.5.1         tidyverse_2.0.0       multcomp_1.4-28      
## [13] TH.data_1.1-3         MASS_7.3-65           survival_3.8-3       
## [16] mvtnorm_1.3-3         reshape2_1.4.4        seriation_1.5.7      
## [19] ComplexHeatmap_2.22.0 edgeR_4.4.2           limma_3.62.2         
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1    farver_2.1.2        fastmap_1.2.0      
##  [4] TSP_1.2-4           digest_0.6.37       timechange_0.3.0   
##  [7] lifecycle_1.0.4     cluster_2.1.8.1     Cairo_1.6-2        
## [10] statmod_1.5.0       magrittr_2.0.3      compiler_4.4.3     
## [13] rlang_1.1.5         sass_0.4.9          tools_4.4.3        
## [16] utf8_1.2.4          yaml_2.3.10         knitr_1.50         
## [19] labeling_0.4.3      bit_4.6.0           plyr_1.8.9         
## [22] RColorBrewer_1.1-3  registry_0.5-1      ca_0.71.1          
## [25] withr_3.0.2         BiocGenerics_0.52.0 stats4_4.4.3       
## [28] colorspace_2.1-1    scales_1.3.0        iterators_1.0.14   
## [31] cli_3.6.4           rmarkdown_2.29      crayon_1.5.3       
## [34] generics_0.1.3      tzdb_0.5.0          rjson_0.2.23       
## [37] cachem_1.1.0        splines_4.4.3       assertthat_0.2.1   
## [40] parallel_4.4.3      matrixStats_1.5.0   vctrs_0.6.5        
## [43] Matrix_1.7-3        sandwich_3.1-1      jsonlite_1.9.1     
## [46] IRanges_2.40.1      GetoptLong_1.0.5    hms_1.1.3          
## [49] S4Vectors_0.44.0    bit64_4.6.0-1       clue_0.3-66        
## [52] magick_2.8.5        locfit_1.5-9.12     foreach_1.5.2      
## [55] jquerylib_0.1.4     glue_1.8.0          codetools_0.2-20   
## [58] stringi_1.8.4       shape_1.4.6.1       gtable_0.3.6       
## [61] munsell_0.5.1       pillar_1.10.1       htmltools_0.5.8.1  
## [64] circlize_0.4.16     R6_2.6.1            doParallel_1.0.17  
## [67] vroom_1.6.5         evaluate_1.0.3      lattice_0.22-6     
## [70] png_0.1-8           bslib_0.9.0         Rcpp_1.0.14        
## [73] xfun_0.51           zoo_1.8-13          pkgconfig_2.0.3    
## [76] GlobalOptions_0.1.2
