Deep learning has moved from hype to essential tool in genomics. Models now routinely outperform hand-engineered features on tasks like predicting gene expression, transcription factor binding, and chromatin accessibility from raw sequence. This post walks through building one such model end-to-end in PyTorch.
The Problem Setup
We’ll predict gene expression levels (log-normalized counts) from the 2 kb promoter sequence upstream of each transcription start site (TSS). This is a well-studied proxy task — promoter sequence encodes a substantial fraction of expression variance across tissues and conditions.
Input: DNA sequence (one-hot encoded, shape [N, 4, 2000])
Output: Predicted log-expression value (scalar per gene)
Data Preparation
Start by extracting promoter sequences with pybedtools and encoding them:
import numpy as np
import pybedtools
from pyfaidx import Fasta
def get_promoter_seqs(gene_bed, genome_fasta, upstream=2000):
"""Extract upstream promoter sequences for each gene TSS."""
genome = Fasta(genome_fasta)
promoters = []
for feature in pybedtools.BedTool(gene_bed):
chrom = feature.chrom
if feature.strand == '+':
start = max(0, feature.start - upstream)
end = feature.start
else:
start = feature.end
end = feature.end + upstream
seq = str(genome[chrom][start:end]).upper()
if feature.strand == '-':
seq = reverse_complement(seq)
promoters.append(seq)
return promoters
def one_hot_encode(sequence):
"""Encode a DNA string as a (4, L) float32 array."""
mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
ohe = np.zeros((4, len(sequence)), dtype=np.float32)
for i, base in enumerate(sequence):
if base in mapping:
ohe[mapping[base], i] = 1.0
return ohe
def reverse_complement(seq):
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
return ''.join(complement.get(b, 'N') for b in reversed(seq))
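The per-base loop above is perfectly adequate for one-off preprocessing. For large datasets, a vectorized variant (a sketch with the same behaviour, using a lookup table instead of a Python-level loop) can be noticeably faster:

```python
import numpy as np

def one_hot_encode_vec(sequence):
    """Vectorized one-hot: map bases to row indices; unknown bases get all-zero columns."""
    idx = np.frombuffer(sequence.encode('ascii'), dtype=np.uint8)
    # Lookup table: A/C/G/T -> rows 0..3, everything else -> row 4 (dropped below)
    lut = np.full(256, 4, dtype=np.int64)
    for i, b in enumerate(b'ACGT'):
        lut[b] = i
    rows = lut[idx]
    ohe = np.zeros((5, len(sequence)), dtype=np.float32)
    ohe[rows, np.arange(len(sequence))] = 1.0
    return ohe[:4]  # drop the 'unknown' row -> (4, L), matching one_hot_encode

print(one_hot_encode_vec("ACGTN"))  # identity-like columns; 'N' column is all zeros
```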
Model Architecture
The model is a convolutional neural network — appropriate for sequence data because convolutions detect motifs regardless of their position. Here’s the architecture diagram:
Input: (batch, 4, 2000)
            │
            ▼
┌─────────────────────────┐
│ Conv1d(4→256, k=15)     │ ← Motif detection layer
│ BatchNorm + ReLU        │
│ MaxPool(k=4, stride=4)  │
└───────────┬─────────────┘
            │ (batch, 256, 500)
            ▼
┌─────────────────────────┐
│ Conv1d(256→128, k=5)    │
│ BatchNorm + ReLU        │
│ MaxPool(k=4, stride=4)  │
└───────────┬─────────────┘
            │ (batch, 128, 125)
            ▼
┌─────────────────────────┐
│ Conv1d(128→64, k=3)     │
│ BatchNorm + ReLU        │
│ AdaptiveAvgPool(1)      │ ← Collapse to fixed size
└───────────┬─────────────┘
            │ (batch, 64)
            ▼
┌─────────────────────────┐
│ Linear(64→32) + ReLU    │
│ Dropout(0.3)            │
│ Linear(32→1)            │ ← Predicted log-expression
└─────────────────────────┘
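The intermediate lengths follow from the standard conv/pool output formula. A quick check (assuming the "same" padding used in the PyTorch code below, which keeps conv outputs at the input length):

```python
def conv_out(length, kernel, padding=0, stride=1):
    """Standard 1-D conv/pool length formula: floor((L + 2p - k) / s) + 1."""
    return (length + 2 * padding - kernel) // stride + 1

L = 2000
L = conv_out(L, kernel=15, padding=7)  # 'same' conv  -> 2000
L = conv_out(L, kernel=4, stride=4)    # max-pool     -> 500
L = conv_out(L, kernel=5, padding=2)   # 'same' conv  -> 500
L = conv_out(L, kernel=4, stride=4)    # max-pool     -> 125
print(L)  # 125; AdaptiveAvgPool1d(1) then collapses this to 1
```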
Translating this into PyTorch:
import torch
import torch.nn as nn
class PromoterExpressionModel(nn.Module):
def __init__(self):
super().__init__()
self.conv_layers = nn.Sequential(
# Block 1: motif detection
nn.Conv1d(4, 256, kernel_size=15, padding=7),
nn.BatchNorm1d(256),
nn.ReLU(),
nn.MaxPool1d(kernel_size=4, stride=4),
# Block 2: higher-order feature detection
nn.Conv1d(256, 128, kernel_size=5, padding=2),
nn.BatchNorm1d(128),
nn.ReLU(),
nn.MaxPool1d(kernel_size=4, stride=4),
# Block 3: context integration
nn.Conv1d(128, 64, kernel_size=3, padding=1),
nn.BatchNorm1d(64),
nn.ReLU(),
nn.AdaptiveAvgPool1d(1),
)
self.fc = nn.Sequential(
nn.Flatten(),
nn.Linear(64, 32),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(32, 1),
)
def forward(self, x):
x = self.conv_layers(x)
return self.fc(x).squeeze(-1)
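A cheap smoke test catches shape bugs before any training. Here the conv stack is rebuilt compactly as a standalone check (random inputs, untrained weights):

```python
import torch
import torch.nn as nn

# Same conv stack as PromoterExpressionModel, rebuilt inline for a shape check
conv = nn.Sequential(
    nn.Conv1d(4, 256, 15, padding=7), nn.BatchNorm1d(256), nn.ReLU(), nn.MaxPool1d(4, 4),
    nn.Conv1d(256, 128, 5, padding=2), nn.BatchNorm1d(128), nn.ReLU(), nn.MaxPool1d(4, 4),
    nn.Conv1d(128, 64, 3, padding=1), nn.BatchNorm1d(64), nn.ReLU(), nn.AdaptiveAvgPool1d(1),
)
x = torch.randn(8, 4, 2000)  # a fake batch of 8 one-hot promoters
print(conv(x).shape)  # torch.Size([8, 64, 1]); Flatten + the FC head maps this to (8,)
```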
Dataset and DataLoader
from torch.utils.data import Dataset, DataLoader
class PromoterDataset(Dataset):
def __init__(self, sequences, expression_values):
self.X = [one_hot_encode(s) for s in sequences]
self.y = expression_values.astype(np.float32)
def __len__(self):
return len(self.X)
def __getitem__(self, idx):
return torch.tensor(self.X[idx]), torch.tensor(self.y[idx])
# Split into train/val/test (80/10/10). A random split is used here for
# simplicity; see the data-leakage note below for why chromosome-level
# splits are preferable in practice.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
sequences, expression, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
X_test, y_test, test_size=0.5, random_state=42
)
train_loader = DataLoader(PromoterDataset(X_train, y_train), batch_size=64, shuffle=True)
val_loader = DataLoader(PromoterDataset(X_val, y_val), batch_size=256)
Training Loop
import torch.optim as optim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = PromoterExpressionModel().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
criterion = nn.MSELoss()
def train_epoch(model, loader, optimizer, device):
model.train()
total_loss = 0
for X_batch, y_batch in loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
pred = model(X_batch)
loss = criterion(pred, y_batch)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
total_loss += loss.item() * len(y_batch)
return total_loss / len(loader.dataset)
def evaluate(model, loader, device):
model.eval()
preds, targets = [], []
with torch.no_grad():
for X_batch, y_batch in loader:
X_batch = X_batch.to(device)
preds.append(model(X_batch).cpu())
targets.append(y_batch)
preds = torch.cat(preds)
targets = torch.cat(targets)
mse = nn.MSELoss()(preds, targets).item()
corr = np.corrcoef(preds.numpy(), targets.numpy())[0, 1]
return mse, corr
for epoch in range(100):
train_loss = train_epoch(model, train_loader, optimizer, device)
val_mse, val_corr = evaluate(model, val_loader, device)
scheduler.step(val_mse)
print(f"Epoch {epoch+1:3d} | train_loss={train_loss:.4f} | val_mse={val_mse:.4f} | val_r={val_corr:.3f}")
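A common companion to the LR scheduler, not shown in the loop above, is early stopping on validation MSE. A minimal sketch of the bookkeeping:

```python
class EarlyStopping:
    """Stop when the monitored metric hasn't improved for `patience` epochs."""
    def __init__(self, patience=10, min_delta=1e-4):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float('inf')
        self.bad_epochs = 0

    def step(self, metric):
        """Record one epoch's metric; return True when training should stop."""
        if metric < self.best - self.min_delta:
            self.best = metric
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience

# Inside the epoch loop:
#   stopper = EarlyStopping(patience=10)
#   if stopper.step(val_mse):
#       break
```

Saving `model.state_dict()` whenever `best` improves lets you restore the best checkpoint after stopping.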
Model Interpretation with In-Silico Mutagenesis
Understanding why the model makes the predictions it does is crucial in biology. In-silico mutagenesis (ISM) measures how much each nucleotide position contributes to the prediction: substitute every alternative base at every position and record the change in output:
def in_silico_mutagenesis(model, sequence_ohe, device):
"""
Compute ISM scores for a single (4, L) one-hot encoded sequence.
Returns a (4, L) importance matrix.
"""
model.eval()
L = sequence_ohe.shape[1]
bases = ['A', 'C', 'G', 'T']
# Baseline prediction
x_ref = torch.tensor(sequence_ohe).unsqueeze(0).to(device)
with torch.no_grad():
ref_pred = model(x_ref).item()
ism_scores = np.zeros((4, L))
for pos in range(L):
for base_idx in range(4):
x_mut = sequence_ohe.copy()
x_mut[:, pos] = 0
x_mut[base_idx, pos] = 1
x_mut_t = torch.tensor(x_mut).unsqueeze(0).to(device)
with torch.no_grad():
mut_pred = model(x_mut_t).item()
ism_scores[base_idx, pos] = mut_pred - ref_pred
return ism_scores
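To reduce the (4, L) score matrix to a single per-position importance track for plotting, one common summary is the maximum absolute effect across substitutions. A sketch on synthetic scores:

```python
import numpy as np

def ism_position_importance(ism_scores):
    """Per-position importance: max |mutation effect| over the 4 substitutions.
    (The 'substitution' to the reference base scores 0 by construction,
    so it never dominates the max.)"""
    return np.abs(ism_scores).max(axis=0)

# Synthetic example: a strong effect at position 2, nothing elsewhere
scores = np.zeros((4, 5))
scores[0, 2] = -0.8  # mutating position 2 to 'A' drops the prediction
print(ism_position_importance(scores))  # [0.  0.  0.8 0.  0. ]
```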
The resulting scores can be visualised as a sequence logo — positions with large absolute values correspond to motifs the model has learned (e.g., TATA box, GC-rich initiator elements). Note that naive ISM on a 2 kb sequence as written requires 8,000 forward passes per gene; batching the mutated sequences speeds this up substantially.
Practical Advice
Augment with reverse complement. DNA is double-stranded — training on both orientations often improves generalisation:
import random

# During training, randomly flip sequences to their reverse complement
if random.random() > 0.5:
    X_batch = torch.flip(X_batch, dims=[1, 2])  # reverse positions + swap A↔T, C↔G
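That double flip is exactly the reverse complement in one-hot space, which is easy to verify (a sketch, assuming the A/C/G/T channel order used throughout this post):

```python
import numpy as np

def one_hot(seq):
    mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    ohe = np.zeros((4, len(seq)), dtype=np.float32)
    for i, b in enumerate(seq):
        ohe[mapping[b], i] = 1.0
    return ohe

comp = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
rc = ''.join(comp[b] for b in reversed("ACGGT"))  # 'ACCGT'

# Reversing BOTH the channel axis (A<->T, C<->G) and the position axis
# of the one-hot array equals encoding the reverse-complement string.
flipped = one_hot("ACGGT")[::-1, ::-1]
assert np.array_equal(flipped, one_hot(rc))
```

This works because reversing the channel axis maps index i to 3 - i, which is exactly the A↔T, C↔G complement under this encoding.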
Use pretrained genomic models. Models like Enformer and Borzoi are pretrained on thousands of genomic tracks. Fine-tuning them on your task is typically far more data-efficient than training from scratch.
Watch for data leakage. Always split by chromosome rather than randomly — nearby genes share regulatory sequence context, and random splits produce inflated metrics.
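A chromosome-level split can be as simple as holding out whole chromosomes. A sketch (the held-out chromosome names here are illustrative choices, not a standard):

```python
def split_by_chromosome(genes, val_chroms={'chr8'}, test_chroms={'chr9'}):
    """Partition (chrom, gene_id) records into train/val/test by chromosome,
    so no regulatory neighbourhood is shared across splits."""
    train, val, test = [], [], []
    for chrom, gene in genes:
        if chrom in test_chroms:
            test.append(gene)
        elif chrom in val_chroms:
            val.append(gene)
        else:
            train.append(gene)
    return train, val, test

genes = [('chr1', 'g1'), ('chr8', 'g2'), ('chr9', 'g3'), ('chr1', 'g4')]
train, val, test = split_by_chromosome(genes)
print(train, val, test)  # ['g1', 'g4'] ['g2'] ['g3']
```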
Deep learning for genomics is a rich and rapidly evolving space. The key is to stay grounded in biology: use your model’s predictions to generate hypotheses you can test in the lab.