% Journal article in a supplement issue of Bioinformatics (volume 19, 2003).
% Author given names are stored as initials only, as supplied by the export.
@article{6dc39bbb982e4b819415aef5f1919c04,
  author    = {Halperin, E. and Buhler, J. and Karp, R. and Krauthgamer, R. and Westover, B.},
  title     = {Detecting protein sequence conservation via metric embeddings},
  journal   = {Bioinformatics},
  year      = {2003},
  volume    = {19},
  number    = {SUPPL. 1},
  pages     = {i122--i129},
  doi       = {10.1093/bioinformatics/btg1016},
  issn      = {1367-4803},
  publisher = {Oxford University Press},
  language  = {English},
  keywords  = {Database indexing, Hamming space, Metric embedding, Protein comparison},
  abstract  = {Motivation: Comparing two protein databases is a fundamental task in biosequence annotation. Given two databases, one must find all pairs of proteins that align with high score under a biologically meaningful substitution score matrix, such as a BLOSUM matrix (Henikoff and Henikoff, 1992). Distance-based approaches to this problem map each peptide in the database to a point in a metric space, such that peptides aligning with higher scores are mapped to closer points. Many techniques exist to discover close pairs of points in a metric space efficiently, but the challenge in applying this work to proteomic comparison is to find a distance mapping that accurately encodes all the distinctions among residue pairs made by a proteomic score matrix. Buhler (2002) proposed one such mapping but found that it led to a relatively inefficient algorithm for protein-protein comparison. Results: This work proposes a new distance mapping for peptides under the BLOSUM matrices that permits more efficient similarity search. We first propose a new distance function on peptides derived from a given score matrix. We then show how to map peptides to bit vectors such that the distance between any two peptides is closely approximated by the Hamming distance (i.e. number of mismatches) between their corresponding bit vectors. We combine these two results with the LSH-ALL-PAIRS-SIM algorithm of Buhler (2002) to produce an improved distance-based algorithm for proteomic comparison. An initial implementation of the improved algorithm exhibits sensitivity within 5\% of that of the original LSH-ALL-PAIRS-SIM, while running up to eight times faster. Availability: The source of the code can be found at http://www.eecs.berkeley.edu/~eran/projects/embed.},
  note      = {Funding Information: The first, third and fourth authors were supported in part by NSF grants CCR-9820951 and CCR-0121555 and DARPA cooperative agreement F30602-00-2-0601.},
}