bioinformatics - Efficiently match multiple strings/keywords to multiple texts in R -

i trying efficiently map exact peptides (short sequences of amino acids in 26 character alphabet a-z¹) proteins (longer sequences of same alphabet). efficient way i'm aware of aho-corasick trie (where peptides keywords). unfortunately can't find version of ac in r work non-nucleotide alphabet (biostrings' pdict , starr's match_ac both hard-coded dna).

as crutch i've been trying parallelize basic grep approach. i'm having trouble figuring out way without incurring significant io overhead. here brief example:

peptides = c("fsssgggggggr","gahlqggak","ggsggsyggggsgggygggsgsr","iisnascttnclaplak") if (!exists("proteins")) {   bioclite("biomart", ask=f, suppressupdates=t, suppressautoupdate=t)   library(biomart)   ensembl = usemart("ensembl",dataset="hsapiens_gene_ensembl")   proteins = getbm(attributes=c('peptide', 'refseq_peptide'), filters='refseq_peptide', values=c("np_000217", "np_001276675"), mart=ensembl)   row.names(proteins) = proteins$refseq_peptide }  library(snowfall) library(biostrings) library(plyr) sfinit(parallel=t, cpus=detectcores()-1)  allpeptideinstances = null i=1 increment=100 count=nrow(proteins) while(t) {   print(paste(i, min(count, i+increment), sep=":"))   text_source = proteins[i:min(count, i+increment),]   text = text_source$peptide    #peptideinstances = sapply(peptides, regexpr, text, fixed=t, usebytes=t)   peptideinstances = sfsapply(peptides, regexpr, text, fixed=t, usebytes=t)   dimnames(peptideinstances) = list(text_source$refseq_peptide, colnames(peptideinstances))    sparsepeptideinstances = alply(peptideinstances, 2, .fun = function(x) {x[x > 0]}, .dims = t)    allpeptideinstances = c(allpeptideinstances, sparsepeptideinstances, recursive=t)   if (i==count | nrow(text_source) < increment)     break   = i+increment }  sfstop()

there few issues here:

peptideinstances here dense matrix, returning each worker verbose. have broken blocks i'm not dealing 40,000 (proteins) x 60,000 (peptides) matrix.
parallelizing on peptides, when make more sense parallelize on proteins because they're bigger. got frustrated trying protein because:
this code breaks if there 1 protein in text_source.

alternatively, if aware of better solution in r, i'm happy use that. i've spent enough time on have been better served implementing aho-corasick.

¹ of ambiguity codes, simplicity, ignore that.

i learned rcpp , implemented aho-corasick myself. cran has general purpose multiple-keyword search package.

here usage examples:

listequals = function(a, b) { is.null(unlist(a)) && is.null(unlist(b)) || !is.null(a) && !is.null(b) && all(unlist(a) == unlist(b)) }  # simple search of multiple keywords in single text keywords = c("abra", "cadabra", "is", "the", "magic", "word") onesearch = ahocorasicksearch(keywords, "is abracadabra magic word?") stopifnot(listequals(onesearch[[1]][[1]], list(keyword="abra", offset=4))) stopifnot(listequals(onesearch[[1]][[2]], list(keyword="cadabra", offset=8))) stopifnot(listequals(onesearch[[1]][[3]], list(keyword="the", offset=16))) stopifnot(listequals(onesearch[[1]][[4]], list(keyword="magic", offset=20))) stopifnot(listequals(onesearch[[1]][[5]], list(keyword="word", offset=26)))  # search list of lists # * sublists accessed index # * texts accessed index # * non-matched texts kept (to preserve index order) listsearch = ahocorasicksearchlist(keywords, list(c("what in", "the world"), c("is"), "secret about", "the magic word?")) stopifnot(listequals(listsearch[[1]][[1]], list())) stopifnot(listequals(listsearch[[1]][[2]][[1]], list(keyword="the", offset=1))) stopifnot(listequals(listsearch[[2]][[1]][[1]], list(keyword="is", offset=1))) stopifnot(listequals(listsearch[[3]], list())) stopifnot(listequals(listsearch[[4]][[1]][[1]], list(keyword="the", offset=1))) stopifnot(listequals(listsearch[[4]][[1]][[2]], list(keyword="magic", offset=5))) stopifnot(listequals(listsearch[[4]][[1]][[3]], list(keyword="word", offset=11)))  # named search of list of lists # * sublists accessed name # * matched texts accessed name # * non-matched texts dropped namedsearch = ahocorasicksearchlist(keywords, list(subject=c(phrase1="what in", phrase2="the world"),                                                    verb=c(phrase1="is"),                                                    predicate1=c(phrase1="secret about"),                                                    predicate2=c(phrase1="the magic word?"))) stopifnot(listequals(namedsearch$subject$phrase2[[1]], list(keyword="the", offset=1))) stopifnot(listequals(namedsearch$verb$phrase1[[1]], list(keyword="is", offset=1))) stopifnot(listequals(namedsearch$predicate1, list())) stopifnot(listequals(namedsearch$predicate2$phrase1[[1]], list(keyword="the", offset=1))) stopifnot(listequals(namedsearch$predicate2$phrase1[[2]], list(keyword="magic", offset=5))) stopifnot(listequals(namedsearch$predicate2$phrase1[[3]], list(keyword="word", offset=11)))  # named search of multiple texts in single list keyword grouping , aminoacid alphabet # * matches keyword accessed name # * non-matched keywords dropped proteins = c(protein1="peptidepeptidedadadararararakekekekepeptide",              protein2="derpaderpapewpewpeepeerawrawwarragtagpeptidekesequence") peptides = c("peptide", "derpa", "sequence", "keke", "peppie") peptidesearch = ahocorasicksearch(peptides, proteins, alphabet="aminoacid", groupbykeyword=t) stopifnot(listequals(peptidesearch$peptide, list(list(keyword="protein1", offset=1),                                                  list(keyword="protein1", offset=8),                                                  list(keyword="protein1", offset=37),                                                  list(keyword="protein2", offset=38)))) stopifnot(listequals(peptidesearch$derpa, list(list(keyword="protein2", offset=1),                                                list(keyword="protein2", offset=6)))) stopifnot(listequals(peptidesearch$sequence, list(list(keyword="protein2", offset=47)))) stopifnot(listequals(peptidesearch$keke, list(list(keyword="protein1", offset=29),                                               list(keyword="protein1", offset=31),                                               list(keyword="protein1", offset=33)))) stopifnot(listequals(peptidesearch$peppie, null))  # grouping keyword without text names: offsets given without reference text names(proteins) = null peptidesearch = ahocorasicksearch(peptides, proteins, groupbykeyword=t) stopifnot(listequals(peptidesearch$peptide, list(1, 8, 37, 38))) stopifnot(listequals(peptidesearch$derpa, list(1, 6))) stopifnot(listequals(peptidesearch$sequence, list(47))) stopifnot(listequals(peptidesearch$keke, list(29, 31, 33)))

Search This Blog

Jal

bioinformatics - Efficiently match multiple strings/keywords to multiple texts in R -

Comments

Post a Comment

Popular posts from this blog

javascript - Slick Slider width recalculation -

jsf - PrimeFaces Datatable - What is f:facet actually doing? -

angular2 services - Angular 2 RC 4 Http post not firing -