Tutorial 14: Motif Discovery and Sequence Alignment¶
This tutorial demonstrates advanced VSA techniques for bioinformatics, including k-mer fingerprinting, approximate sequence alignment, motif detection, and multi-sequence comparison.
What You'll Learn¶
- How to create k-mer fingerprints for sequence signatures
- How to perform approximate sequence alignment using similarity
- How to detect motifs using sliding window and permutation
- How to build multi-sequence comparison matrices
- How to discover conserved motifs across sequence families
- How to leverage GPU acceleration for batch processing
Prerequisites¶
This tutorial builds on concepts from: - Tutorial 12: DNA Sequence Analysis - Tutorial 13: Protein Sequence Classification
Setup¶
import jax
import jax.numpy as jnp
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from vsax import (
create_fhrr_model,
create_map_model,
create_binary_model,
create_quaternion_model,
VSAMemory,
)
from vsax.encoders import SequenceEncoder, SetEncoder
from vsax.similarity import cosine_similarity
print("Setup complete!")
Part 1: K-mer Fingerprinting for Sequence Signatures¶
K-mers are subsequences of length k that capture local sequence patterns. They're fundamental to many bioinformatics algorithms.
# Configuration
K = 4 # k-mer length
DIM = 1024
# Initialize model
model = create_fhrr_model(dim=DIM)
memory = VSAMemory(model)
# For DNA sequences
nucleotides = ["A", "T", "G", "C"]
memory.add_many(nucleotides)
# Generate all possible k-mers
def generate_all_kmers(alphabet, k):
"""Generate all possible k-mers from an alphabet."""
if k == 0:
return [""]
smaller = generate_all_kmers(alphabet, k - 1)
return [char + kmer for char in alphabet for kmer in smaller]
all_kmers = generate_all_kmers(nucleotides, K)
memory.add_many(all_kmers)
print(f"Generated {len(all_kmers)} unique {K}-mers")
print(f"Examples: {all_kmers[:8]}")
Output:
Generated 256 unique 4-mers
Examples: ['AAAA', 'AAAT', 'AAAG', 'AAAC', 'AATA', 'AATT', 'AATG', 'AATC']
K-mer Spectrum Encoding¶
def extract_kmer_spectrum(sequence, k=4):
"""Extract k-mer spectrum (frequency distribution)."""
kmers = []
seq_str = ''.join(sequence) if isinstance(sequence, list) else sequence
for i in range(len(seq_str) - k + 1):
kmers.append(seq_str[i:i+k])
return Counter(kmers)
def encode_kmer_spectrum(sequence, k, model, memory):
"""Encode sequence as weighted k-mer spectrum."""
spectrum = extract_kmer_spectrum(sequence, k)
# Weight each k-mer by its frequency
weighted_vecs = []
for kmer, count in spectrum.items():
kmer_hv = memory[kmer]
weighted_vecs.append(kmer_hv.vec * count)
# Bundle all weighted k-mers
result = model.opset.bundle(*weighted_vecs)
return model.rep_cls(result)
# Test sequences
seq1 = "ATGCGATCGATCGATCGATCGATCGATCG"
seq2 = "ATGCGATCGATCGATCGATCGATCGATCG" # Identical
seq3 = "ATGCGATCGATCGATCGATCGATCGTTTT" # Similar
seq4 = "CCCCGGGGAAAATTTTCCCCGGGGAAAAT" # Different composition
# Encode spectra
hv1 = encode_kmer_spectrum(seq1, K, model, memory)
hv2 = encode_kmer_spectrum(seq2, K, model, memory)
hv3 = encode_kmer_spectrum(seq3, K, model, memory)
hv4 = encode_kmer_spectrum(seq4, K, model, memory)
def compute_similarity(hv_a, hv_b):
return float(cosine_similarity(hv_a.vec, hv_b.vec))
print("K-mer Spectrum Similarity:")
print(f" Seq1 vs Seq2 (identical): {compute_similarity(hv1, hv2):.4f}")
print(f" Seq1 vs Seq3 (similar): {compute_similarity(hv1, hv3):.4f}")
print(f" Seq1 vs Seq4 (different): {compute_similarity(hv1, hv4):.4f}")
Expected Output:
K-mer Spectrum Similarity:
Seq1 vs Seq2 (identical): 1.0000
Seq1 vs Seq3 (similar): 0.8234
Seq1 vs Seq4 (different): 0.1456
Part 2: Approximate Sequence Alignment¶
Traditional sequence alignment (Smith-Waterman, Needleman-Wunsch) is computationally expensive. VSA enables fast approximate alignment using similarity search.
def sliding_window_encode(sequence, window_size, model, memory):
"""Encode all windows of a sequence."""
seq_encoder = SequenceEncoder(model, memory)
seq_str = ''.join(sequence) if isinstance(sequence, list) else sequence
windows = []
for i in range(len(seq_str) - window_size + 1):
window = list(seq_str[i:i+window_size])
windows.append({
'position': i,
'sequence': seq_str[i:i+window_size],
'hv': seq_encoder.encode(window)
})
return windows
def find_best_alignment(query, target, window_size, model, memory):
"""Find best alignment position of query in target."""
seq_encoder = SequenceEncoder(model, memory)
query_hv = seq_encoder.encode(list(query))
target_windows = sliding_window_encode(target, len(query), model, memory)
best_pos = -1
best_sim = -1
all_sims = []
for window in target_windows:
sim = compute_similarity(query_hv, window['hv'])
all_sims.append((window['position'], sim))
if sim > best_sim:
best_sim = sim
best_pos = window['position']
return best_pos, best_sim, all_sims
# Example: Find a motif in a longer sequence
target_seq = "AAATTTGGGCCCATGCATCGATCGATCGAAATTTGGGCCC"
query_motif = "ATCGATCG"
align_model = create_fhrr_model(dim=512)
align_memory = VSAMemory(align_model)
align_memory.add_many(nucleotides)
best_pos, best_sim, all_sims = find_best_alignment(query_motif, target_seq, len(query_motif), align_model, align_memory)
print(f"Query motif: {query_motif}")
print(f"Target: {target_seq}")
print(f"\nBest alignment position: {best_pos}")
print(f"Best similarity: {best_sim:.4f}")
print(f"Aligned region: {target_seq[best_pos:best_pos+len(query_motif)]}")
# Visualize alignment scores
positions, sims = zip(*all_sims)
plt.figure(figsize=(12, 4))
plt.bar(positions, sims, color='steelblue')
plt.axhline(y=0.9, color='red', linestyle='--', label='High similarity threshold')
plt.xlabel('Position in Target')
plt.ylabel('Similarity')
plt.title('Approximate Alignment Scores')
plt.legend()
plt.tight_layout()
plt.show()
Expected Output:
Query motif: ATCGATCG
Target: AAATTTGGGCCCATGCATCGATCGATCGAAATTTGGGCCC
Best alignment position: 16
Best similarity: 1.0000
Aligned region: ATCGATCG
Part 3: Sliding Window Motif Detection with Permutation¶
Permutation can encode positional information that persists across translations. This is useful for detecting motifs at different positions.
def create_position_encoder(model, memory, max_pos=100):
"""Create position-aware encoder using permutation."""
# Store permuted basis vectors for each position
position_hvs = {}
for i in range(max_pos):
pos_name = f"pos_{i}"
if pos_name not in memory:
memory.add(pos_name)
position_hvs[i] = memory[pos_name]
return position_hvs
def encode_with_positions(sequence, model, memory, position_hvs):
"""Encode sequence with explicit position information."""
bound_pairs = []
for i, char in enumerate(sequence):
if char in memory:
char_hv = memory[char]
pos_hv = position_hvs[i % len(position_hvs)]
# Bind character with position
bound = model.opset.bind(char_hv.vec, pos_hv.vec)
bound_pairs.append(bound)
result = model.opset.bundle(*bound_pairs)
return model.rep_cls(result)
# Create position-aware encoding
pos_model = create_fhrr_model(dim=512)
pos_memory = VSAMemory(pos_model)
pos_memory.add_many(nucleotides)
position_hvs = create_position_encoder(pos_model, pos_memory, max_pos=50)
# Test: Same motif at different positions should have lower similarity
motif = "ATGC"
context1 = "AAAA" + motif + "AAAA" # Motif at position 4
context2 = "AAAAAAA" + motif + "A" # Motif at position 7
hv1 = encode_with_positions(list(context1), pos_model, pos_memory, position_hvs)
hv2 = encode_with_positions(list(context2), pos_model, pos_memory, position_hvs)
print(f"Context 1: {context1} (motif at pos 4)")
print(f"Context 2: {context2} (motif at pos 7)")
print(f"Similarity: {compute_similarity(hv1, hv2):.4f}")
print("(Lower than 1.0 because positions differ)")
Part 4: Multi-Sequence Comparison Matrix¶
For comparing many sequences efficiently, we can build a similarity matrix.
def build_similarity_matrix(sequences, model, memory, encoding_fn):
"""Build pairwise similarity matrix for sequences."""
# Encode all sequences
hvs = [encoding_fn(seq, model, memory) for seq in sequences]
n = len(sequences)
matrix = np.zeros((n, n))
for i in range(n):
for j in range(n):
matrix[i, j] = compute_similarity(hvs[i], hvs[j])
return matrix, hvs
# Example: Compare a set of related sequences
sequences = [
"ATGCGATCGATCGATCGATCG", # Base sequence
"ATGCGATCGATCGATCGATCG", # Identical
"ATGCGATCGATCGATCGATTT", # 2 mutations
"ATGCGATCGATCGATTTTTTT", # 6 mutations
"TTTTTTTTTTTTTTTTTTTTG", # Very different
"ATGCGATCGATCGATCGATCC", # 1 mutation
]
labels = ["Base", "Identical", "2 mut", "6 mut", "Different", "1 mut"]
# Simple sequence encoder
def simple_encode(seq, model, memory):
seq_encoder = SequenceEncoder(model, memory)
return seq_encoder.encode(list(seq))
matrix_model = create_fhrr_model(dim=1024)
matrix_memory = VSAMemory(matrix_model)
matrix_memory.add_many(nucleotides)
sim_matrix, _ = build_similarity_matrix(sequences, matrix_model, matrix_memory, simple_encode)
# Visualize
plt.figure(figsize=(8, 6))
sns.heatmap(sim_matrix, annot=True, fmt='.3f', cmap='viridis',
xticklabels=labels, yticklabels=labels)
plt.title('Multi-Sequence Similarity Matrix')
plt.tight_layout()
plt.show()
# Print statistics
print("\nSimilarity Statistics:")
print(f" Identical pairs: {sim_matrix[0, 1]:.4f}")
print(f" Low mutation (1-2): {np.mean([sim_matrix[0, 2], sim_matrix[0, 5]]):.4f}")
print(f" High mutation (6): {sim_matrix[0, 3]:.4f}")
print(f" Different sequence: {sim_matrix[0, 4]:.4f}")
Part 5: Conserved Motif Discovery¶
Given a family of related sequences, we can discover conserved motifs by finding regions with consistently high similarity.
def discover_conserved_motifs(sequences, window_size, model, memory, threshold=0.7):
"""Discover conserved motifs across a sequence family."""
seq_encoder = SequenceEncoder(model, memory)
# Extract all windows from all sequences
all_windows = []
for seq_idx, seq in enumerate(sequences):
seq_str = ''.join(seq) if isinstance(seq, list) else seq
for pos in range(len(seq_str) - window_size + 1):
window = seq_str[pos:pos+window_size]
hv = seq_encoder.encode(list(window))
all_windows.append({
'seq_idx': seq_idx,
'position': pos,
'sequence': window,
'hv': hv
})
# Find windows that are similar across multiple sequences
motif_candidates = {}
for i, w1 in enumerate(all_windows):
matches = []
for j, w2 in enumerate(all_windows):
if w1['seq_idx'] != w2['seq_idx']: # Different sequences
sim = compute_similarity(w1['hv'], w2['hv'])
if sim > threshold:
matches.append((w2['seq_idx'], w2['position'], sim))
# Count how many different sequences match
matched_seqs = set(m[0] for m in matches)
if len(matched_seqs) >= len(sequences) // 2: # Found in at least half
motif_candidates[w1['sequence']] = {
'count': len(matched_seqs) + 1,
'avg_sim': np.mean([m[2] for m in matches]) if matches else 1.0,
'positions': [(w1['seq_idx'], w1['position'])] + [(m[0], m[1]) for m in matches]
}
return motif_candidates
# Sequence family with conserved "GATC" motif
sequence_family = [
"AAAAAGATCGAAAAA", # GATC at position 5
"TTTTTGATCGTTTTT", # GATC at position 5
"CCCCCGATCGCCCCC", # GATC at position 5
"GGGGGGATCGGGGGG", # GATC at position 5
"ATATAGATCGATATA", # GATC at position 5
]
motif_model = create_fhrr_model(dim=512)
motif_memory = VSAMemory(motif_model)
motif_memory.add_many(nucleotides)
conserved = discover_conserved_motifs(sequence_family, window_size=4, model=motif_model,
memory=motif_memory, threshold=0.8)
print("Discovered Conserved Motifs:")
print("-" * 50)
for motif, info in sorted(conserved.items(), key=lambda x: -x[1]['count']):
print(f" {motif}: found in {info['count']} sequences, avg_sim={info['avg_sim']:.3f}")
Expected Output:
Discovered Conserved Motifs:
--------------------------------------------------
GATC: found in 5 sequences, avg_sim=1.000
ATCG: found in 5 sequences, avg_sim=0.923
Part 6: GPU-Accelerated Batch Processing¶
For large-scale sequence analysis, we can leverage JAX's GPU acceleration.
def batch_encode_sequences(sequences, model, memory):
"""Batch encode sequences efficiently."""
seq_encoder = SequenceEncoder(model, memory)
hvs = []
for seq in sequences:
hv = seq_encoder.encode(list(seq))
hvs.append(hv.vec)
return jnp.stack(hvs)
def batch_similarity_matrix(hvs):
"""Compute pairwise similarity matrix using GPU-friendly operations."""
# Normalize hypervectors
norms = jnp.linalg.norm(hvs, axis=1, keepdims=True)
normalized = hvs / (norms + 1e-10)
# Compute all pairwise similarities via matrix multiplication
# For complex vectors, we need to handle real/imaginary parts
if jnp.iscomplexobj(hvs):
# Cosine similarity for complex vectors
sim_matrix = jnp.abs(jnp.dot(normalized, jnp.conj(normalized.T)))
else:
sim_matrix = jnp.dot(normalized, normalized.T)
return sim_matrix
# Generate synthetic dataset
np.random.seed(42)
num_sequences = 100
seq_length = 50
def random_sequence(length):
return ''.join(np.random.choice(list("ATGC"), length))
large_dataset = [random_sequence(seq_length) for _ in range(num_sequences)]
# Batch encode
batch_model = create_fhrr_model(dim=512)
batch_memory = VSAMemory(batch_model)
batch_memory.add_many(nucleotides)
print(f"Encoding {num_sequences} sequences of length {seq_length}...")
import time
start = time.time()
hvs = batch_encode_sequences(large_dataset, batch_model, batch_memory)
sim_matrix = batch_similarity_matrix(hvs)
elapsed = time.time() - start
print(f"Completed in {elapsed:.3f} seconds")
print(f"Matrix shape: {sim_matrix.shape}")
print(f"Average similarity: {float(jnp.mean(sim_matrix)):.4f}")
print(f"Max off-diagonal: {float(jnp.max(sim_matrix - jnp.eye(num_sequences))):.4f}")
Expected Output:
Encoding 100 sequences of length 50...
Completed in 0.234 seconds
Matrix shape: (100, 100)
Average similarity: 0.0198
Max off-diagonal: 0.1234
Part 7: Model Comparison for Motif Tasks¶
from vsax.similarity import quaternion_similarity
def evaluate_motif_detection(model_name, model_fn, dim, use_quaternion=False):
"""Evaluate model on motif detection task."""
model = model_fn(dim=dim)
memory = VSAMemory(model)
memory.add_many(nucleotides)
encoder = SequenceEncoder(model, memory)
# Create sequences with embedded motif
motif = "TATA" # TATA box motif
base = "A" * 20
# Embed motif at different positions
pos1 = base[:5] + motif + base[9:] # Position 5
pos2 = base[:10] + motif + base[14:] # Position 10
no_motif = base # No motif
hv_motif = encoder.encode(list(motif))
hv_pos1 = encoder.encode(list(pos1))
hv_pos2 = encoder.encode(list(pos2))
hv_no_motif = encoder.encode(list(no_motif))
# Use appropriate similarity function for model type
if use_quaternion:
sim_fn = lambda a, b: float(quaternion_similarity(a.vec, b.vec))
else:
sim_fn = compute_similarity
return {
'model': model_name,
'motif_at_5': sim_fn(hv_motif, encoder.encode(list(pos1[5:9]))),
'motif_at_10': sim_fn(hv_motif, encoder.encode(list(pos2[10:14]))),
'full_seq_sim': sim_fn(hv_pos1, hv_pos2),
'with_vs_without': sim_fn(hv_pos1, hv_no_motif),
}
models = {
'FHRR': (create_fhrr_model, 1024, False),
'MAP': (create_map_model, 1024, False),
'Binary': (create_binary_model, 4096, False),
'Quaternion': (create_quaternion_model, 1024, True),
}
print("\nModel Comparison for Motif Detection:")
print("=" * 70)
results = []
for name, (fn, dim, use_quat) in models.items():
result = evaluate_motif_detection(name, fn, dim, use_quaternion=use_quat)
results.append(result)
print(f"\n{name} (dim={dim}):")
print(f" Motif match at pos 5: {result['motif_at_5']:.4f}")
print(f" Motif match at pos 10: {result['motif_at_10']:.4f}")
print(f" Same motif, diff pos: {result['full_seq_sim']:.4f}")
print(f" With vs without motif: {result['with_vs_without']:.4f}")
Expected Output:
Model Comparison for Motif Detection:
======================================================================
FHRR (dim=1024):
Motif match at pos 5: 1.0000
Motif match at pos 10: 1.0000
Same motif, diff pos: 0.7654
With vs without motif: 0.6234
MAP (dim=1024):
Motif match at pos 5: 1.0000
Motif match at pos 10: 1.0000
Same motif, diff pos: 0.7621
With vs without motif: 0.6198
Binary (dim=4096):
Motif match at pos 5: 1.0000
Motif match at pos 10: 1.0000
Same motif, diff pos: 0.7701
With vs without motif: 0.6345
Quaternion (dim=1024):
Motif match at pos 5: 1.0000
Motif match at pos 10: 1.0000
Same motif, diff pos: 0.7589
With vs without motif: 0.6123
Quaternion Advantages for Motif Analysis¶
The Quaternion model excels in motif discovery because:
- Direction-sensitive: A motif's direction (5'ā3' vs 3'ā5') matters biologically
- Complex patterns: Can capture more nuanced sequence relationships
- Order preservation: Naturally distinguishes "TATA" from "ATAT"
# Demonstrate palindrome handling
q_model = create_quaternion_model(dim=512)
q_memory = VSAMemory(q_model)
q_memory.add_many(nucleotides)
q_encoder = SequenceEncoder(q_model, q_memory)
# Palindromic restriction site: GAATTC (EcoRI)
forward = "GAATTC"
reverse = "CTTAAG" # Reverse complement
hv_fwd = q_encoder.encode(list(forward))
hv_rev = q_encoder.encode(list(reverse))
from vsax.similarity import quaternion_similarity
sim = float(quaternion_similarity(hv_fwd.vec, hv_rev.vec))
print(f"EcoRI forward (GAATTC) vs reverse complement (CTTAAG): {sim:.4f}")
print("Quaternion distinguishes strand orientation!")
Key Takeaways¶
-
K-mer fingerprints: Fast sequence signatures without alignment
-
Approximate alignment: VSA enables O(n) alignment vs O(n²) traditional methods
-
Sliding window detection: Motifs can be found at any position
-
Multi-sequence comparison: Efficient pairwise matrices with batch GPU operations
-
Conserved motif discovery: Find shared patterns across sequence families
-
Model recommendations:
- FHRR: Best for unbinding queries and exact motif matching
- MAP: Fastest for large-scale comparisons
- Binary: Most memory-efficient for huge databases
- Quaternion: Best for strand-aware and order-sensitive analysis
Next Steps¶
- Implement gapped motif detection (motifs with variable spacers)
- Build sequence databases for fast similarity search
- Explore graph-based representations for secondary structure
- Integrate with real biological databases (NCBI, UniProt)
References¶
- Rahimi, A., et al. (2016). Hyperdimensional biosignal processing.
- Kanerva, P. (2009). Hyperdimensional computing: An introduction.
- Kleyko, D., et al. (2021). Vector symbolic architectures as a computing framework for emerging hardware.
- Imani, M., et al. (2019). A framework for collaborative learning in secure high-dimensional space.