SlideShare a Scribd company logo
analytics
about me
who am i ...
Davy Suvee
@DSUVEE
➡ big data architect @ datablend - continuum
• provide big data and nosql consultancy
• 5 years of hands-on expertise
from data to insights
data analytics in mongodb
chemical similarity use-case
native api
aggregation framework
map/reduce
chemical similarity (1)
★ 31 million compounds available
➡ pubchem
➡ Question:
★ find compounds similar to a
particular other compound
chemicalsimilarity(2)
0[N]1[C	
  O]2[C	
  C	
  C]
0[N]1[C	
  O]2[C	
  C	
  C]3[C	
  C	
  C	
  C	
  C]
0[C]1[C	
  C	
  C]2[C	
  C	
  N	
  O]3[C	
  C	
  C	
  C	
  O	
  O]
0[C]1[C	
  C]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  N	
  O]
0[O]1[C]2[C	
  O]3[C	
  C	
  C]
0[C]1[C	
  O	
  O]2[C	
  C	
  C	
  O]
0[C]1[C	
  C]2[C	
  C]
0[C]1[C]2[C]3[C	
  O]
0[C]1[C	
  C	
  N]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  C	
  O]
...
chemicalsimilarity(3)
0[N]1[C	
  O]2[C	
  C	
  C]
0[N]1[C	
  O]2[C	
  C	
  C]3[C	
  C	
  C	
  C	
  C]
0[C]1[C	
  C	
  C]2[C	
  C	
  N	
  O]3[C	
  C	
  C	
  C	
  O	
  O]
0[C]1[C	
  C]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  N	
  O]
0[O]1[C]2[C	
  O]3[C	
  C	
  C]
0[C]1[C	
  O	
  O]2[C	
  C	
  C	
  O]
0[C]1[C	
  C]2[C	
  C]
0[C]1[C]2[C]3[C	
  O]
0[C]1[C	
  C	
  N]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  C	
  O]
...
0[N]1[C	
  O]2[C	
  C	
  C]3[C	
  C	
  C	
  C	
  C	
  C]
0[C]1[C	
  C	
  C]2[C	
  C	
  N	
  O]3[C	
  C	
  C	
  C	
  O	
  O]
0[C]1[C	
  C]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  N	
  O]
0[O]1[C]2[C	
  O]3[C	
  C	
  C	
  C]
0[C]1[C	
  O	
  O]2[C	
  C	
  C	
  O]
0[C]1[C	
  C]2[C	
  C]
0[N]1[C	
  O]2[C	
  C	
  C]
0[C]1[C]2[C]3[C	
  O]
0[C]1[C	
  C	
  N]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  C	
  O]
...
equality via tanimoto
but 31 million calculations?
mongodb data model (1)
{	
  
	
  	
  	
  	
  "compound_cid"	
  :	
  "46200001"	
  ,	
  
	
  	
  	
  	
  "smiles"	
  :	
  "CCC1C(C(C(C(=NOCC=CCN2CCCCC2)C(CC(C(C(C(C(C(=O)O1)C)OC3C"	
  ,
	
  	
  	
  	
  "fingerprint_count"	
  :	
  120	
  ,	
  
	
  	
  	
  	
  "fingerprints"	
  :	
  [	
  
	
  	
  	
  	
  	
  	
  	
  	
  "0[N]1[C	
  O]2[C	
  C	
  C]"	
  ,
	
  	
  	
  	
  	
  	
  	
  	
  "0[N]1[C	
  O]2[C	
  C	
  C]3[C	
  C	
  C	
  C	
  C]"	
  ,
	
  	
  	
  	
  	
  	
  	
  	
  "0[C]1[C	
  C	
  C]2[C	
  C	
  N	
  O]3[C	
  C	
  C	
  C	
  O	
  O]"	
  ,
	
  	
  	
  	
  	
  	
  	
  	
  "0[C]1[C	
  C]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  N	
  O]"	
  ,
	
  	
  	
  	
  	
  	
  	
  	
  "0[O]1[C]2[C	
  O]3[C	
  C	
  C]"	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  "0[C]1[C	
  O	
  O]2[C	
  C	
  C	
  O]"	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  "0[C]1[C	
  C]2[C	
  C]"	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  "0[C]1[C]2[C]3[C	
  O]"	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  "0[C]1[C	
  C	
  N]2[C	
  C	
  C	
  C	
  O]3[C	
  C	
  C	
  O]"	
  ,
	
  	
  	
  	
  	
  	
  	
  	
  ...	
  ]	
  ,	
  
}
compound
collection
mongodbdatamodel(2) fingerprint
collection
{	
  
	
  	
  	
  	
  "fingerprint"	
  :	
  "0[N]1[C	
  O]2[C	
  C	
  C]",
	
  	
  	
  	
  "count"	
  :	
  472
}
{	
  
	
  	
  	
  	
  "fingerprint"	
  :	
  "0[N]1[C	
  O]2[C	
  C	
  C]3[C	
  C	
  C	
  C	
  C]",
	
  	
  	
  	
  "count"	
  :	
  41
}
{
	
  	
  	
  	
  "fingerprint"	
  :	
  "0[O]1[C]2[C	
  O]3[C	
  C	
  C]",
	
  	
  	
  	
  "count"	
  :	
  1343
}
querying pattern (1)
★ from 31 million -> potential match
➡ narrow down the search space
➡ imagine 80% search for a compound with 40 features
➡ 32
➡ 50
querying pattern (2)
★ from 31 million -> potential match
➡ narrow down the search space
➡ imagine 80% search for a compound with 40 features
(9 fingerprints)
find the fingerprints (1)
//	
  Retrieve	
  the	
  particular	
  compound
DBObject	
  object	
  =	
  
compoundsCollection.findOne(QueryBuilder.start("compound_cid").is(compound).get());
//	
  Retrieve	
  the	
  relevant	
  properties
String	
  pubchemcid	
  =	
  (String)object.get(COMPOUNDCID_PROPERTY);
List<Integer>	
  fingerprintstofind	
  =	
  
Arrays.asList(((BasicDBList)object.get(FINGERPRINTS_PROPERTY)).toArray(new	
  
Integer[]{}));
//	
  Sort	
  the	
  fingerprints	
  on	
  total	
  number	
  of	
  occurrences
fingerprintstofind	
  =	
  findSortedFingerprints(fingerprintstofind);
findthefingerprints(2)
List<Integer>	
  sortedFingerprintsToFind	
  =	
  new	
  ArrayList<Integer>();
	
  	
  //	
  Find	
  all	
  fingerprint	
  count	
  documents
	
  	
  DBObject	
  fingerprintcountquery	
  =	
  
	
  	
  	
  	
  QueryBuilder.start(FINGERPRINT_PROPERTY).in(fingerprintsToFind.toArray()).get();
	
  	
  
	
  	
  //	
  Only	
  retrieve	
  the	
  fingerprint	
  string	
  itself
	
  	
  DBObject	
  fingerprintcountselection	
  =	
  
	
  	
  	
  	
  QueryBuilder.start(FINGERPRINT_PROPERTY).is(1).get();
	
  	
  	
  	
  	
  	
  	
  	
  
	
  	
  //	
  Sort	
  the	
  result	
  on	
  count
	
  	
  DBObject	
  fingerprintcountsort	
  =	
  QueryBuilder.start(COUNT_PROPERTY).is(1).get();
	
  	
  //	
  Execute	
  the	
  query	
  on	
  the	
  fingerprint	
  counts	
  collection
	
  	
  DBCursor	
  fingerprintcounts	
  =	
  
	
  	
  	
  	
  fingerprintCountsCollection.find(fingerprintcountquery,	
  fingerprintcountselection).
	
  	
  	
  	
  sort(fingerprintcountsort);
nativequery(1)
//	
  Find	
  the	
  matching	
  compounds
DBObject	
  compoundquery	
  =	
  
	
  	
  QueryBuilder.
	
  	
  	
  	
  start(FINGERPRINTS_PROPERTY).
	
  	
  	
  	
  in(fingerprintsToConsider).
	
  	
  	
  	
  and(FINGERPRINTCOUNT_PROPERTY).lessThanEquals(maxnumberofcompoundfingerprints).
	
  	
  	
  	
  and(FINGERPRINTCOUNT_PROPERTY).greaterThanEquals(minnumberofcompoundfingerprints).
	
  	
  	
  	
  get();
nativequery(2)
//	
  Execute	
  the	
  query
DBCursor	
  compounds	
  =	
  compoundsCollection.find(compoundquery);
	
  	
  
//	
  Let's	
  process	
  the	
  found	
  compounds	
  locally
while(compounds.hasNext())	
  {
	
  	
  DBObject	
  compound	
  =	
  compounds.next();
	
  	
  BasicDBList	
  fingerprints	
  =	
  ((BasicDBList)	
  	
  
	
  	
  	
  	
  compound.get(FINGERPRINTS_PROPERTY));
	
  	
  
	
  	
  //	
  Calculate	
  the	
  intersection	
  on	
  the	
  total	
  list	
  of	
  fingerprints
	
  	
  fingerprints.retainAll(fingerprintsToFind);
	
  	
  if	
  (fingerprints.size()	
  >=	
  minnumberofcompoundfingerprints)	
  {
	
  	
  	
  	
  //	
  Calculate	
  the	
  tanimoto	
  coefficient	
  ...
	
  	
  }
}	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  
map/reducequery(1)
map/reducequery(2)
	
  //	
  Find	
  all	
  compounds
DBObject	
  compoundquery	
  =	
  ...	
  
//	
  The	
  map	
  function
String	
  map	
  =	
  "function()	
  {	
  	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  var	
  found	
  =	
  0;	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  var	
  fingerprintslength	
  =	
  this.fingerprints.length;	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  for	
  (i	
  =	
  0;	
  i	
  <	
  fingerprintslength;	
  i++)	
  {	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  	
  	
  if	
  (fingerprintstofind[this.fingerprints[i]]	
  ===	
  true)	
  {	
  found++;	
  }	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  }	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  if	
  (found	
  >=	
  minnumberofcompoundfingerprints)	
  {	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  	
  	
  emit	
  (this.compound_cid,	
  {found	
  :	
  found,	
  "	
  +	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  total	
  :	
  this.fingerprint_count}	
  );	
  }	
  "	
  +
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "}";
//	
  Execute	
  the	
  map	
  reduce	
  function
MapReduceCommand	
  mr	
  =	
  new	
  MapReduceCommand(compoundsCollection,	
  map,	
  "",	
  
	
  	
  MapReduceCommand.OutputType.INLINE,	
  compoundquery);	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  
aggregationframework(1)
aggregationframework(2)
{	
  "aggregate"	
  :	
  "compounds"	
  ,	
  
	
  	
  "pipeline"	
  :	
  [	
  
	
  	
  	
  	
  	
  {	
  "$match"	
  :	
  {	
  "fingerprint_count"	
  :	
  {	
  "$gte"	
  :	
  4	
  ,	
  "$lte"	
  :	
  1780}}}	
  ,	
  
	
  	
  	
  	
  	
  {	
  "$unwind"	
  :	
  "$fingerprints"}	
  ,	
  
	
  	
  	
  	
  	
  {	
  "$match"	
  :	
  {	
  "fingerprints"	
  :	
  {	
  "$in"	
  :	
  [	
  1960,	
  15111,	
  ...,94	
  ,	
  26]}}}	
  ,	
  	
  
	
  	
  	
  	
  	
  {	
  "$group"	
  :	
  {	
  "_id"	
  :	
  "$compound_cid"	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "fingerprintmatches"	
  :	
  {	
  "$sum"	
  :	
  1}	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "totalcount"	
  :	
  {	
  "$first"	
  :	
  "$fingerprint_count"}	
  }}}	
  ,	
  	
  
	
  	
  	
  	
  	
  {	
  "$project"	
  :	
  {	
  "_id"	
  :	
  1	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "tanimoto"	
  :	
  {	
  "$divide"	
  :	
  [	
  "$fingerprintmatches"	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  {	
  "$subtract"	
  :	
  [	
  {	
  "$add"	
  :	
  [	
  89	
  ,	
  "$totalcount"]}	
  ,	
  
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  "$fingerprintmatches"]}]}}	
  ,	
  
	
  	
  	
  	
  	
  {	
  "$match"	
  :	
  {	
  "tanimoto"	
  :	
  {	
  "$gte"	
  :	
  0.05}}}]}
	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  	
  
benchmark results
★ native -> 202 ms
➡ 100K compounds, 0.8 tanimoto
★ map/reduce -> 214 ms
★ aggregation framework -> 609 ms
★ native -> 1909 ms
➡ 100K compounds, 0.05 tanimoto
★ map/reduce -> 20978 ms
★ aggregation framework -> 1613 ms
diy mongodb analytics ...
➡ http://datablend.be/?p=256
➡ the joy of algorithms and nosql: a mongodb example
➡ http://github.com/datablend/mongo-compound-comparison-revisited
Questions?
E-MAIL
info@datablend.be
Follow us
twitter.com/data_blend
www.datablend.be
www.datablend.be info@datablend.be 0499/05.00.89
datablend-continuum

More Related Content

PDF
Is HTML5 Ready? (workshop)
PDF
python高级内存管理
PDF
Building Real Time Systems on MongoDB Using the Oplog at Stripe
PDF
MongoDB Europe 2016 - Enabling the Internet of Things at Proximus - Belgium's...
PPTX
Ricky Bobby's World
PDF
Bai Giang 11
PPTX
Fact, Fiction, and FP
PDF
The Ring programming language version 1.5.4 book - Part 59 of 185
Is HTML5 Ready? (workshop)
python高级内存管理
Building Real Time Systems on MongoDB Using the Oplog at Stripe
MongoDB Europe 2016 - Enabling the Internet of Things at Proximus - Belgium's...
Ricky Bobby's World
Bai Giang 11
Fact, Fiction, and FP
The Ring programming language version 1.5.4 book - Part 59 of 185

What's hot (20)

PDF
The Ring programming language version 1.5.1 book - Part 44 of 180
PDF
MongoDBで作るソーシャルデータ新解析基盤
PDF
MongoDB dla administratora
PDF
Pandas+postgre sql 實作 with code
PDF
はじめてのMongoDB
PDF
The Ring programming language version 1.10 book - Part 56 of 212
PDF
Sokoban Game Development Using Java ( Updated using Screenshots & Class Diagr...
PDF
Asssignment2
TXT
PDF
The Ring programming language version 1.7 book - Part 63 of 196
PPTX
Webinarserie: Einführung in MongoDB: “Back to Basics” - Teil 3 - Interaktion ...
PPTX
Webinar: Building Your First App in Node.js
PPTX
Detection of errors and potential vulnerabilities in C and C++ code using the...
PDF
20110514 mongo dbチューニング
DOCX
Binomial heap
PDF
The Ring programming language version 1.5.2 book - Part 66 of 181
PDF
Improved Security Proof for the Camenisch- Lysyanskaya Signature-Based Synchr...
PPT
Mobile Game and Application with J2ME - Collision Detection
PPT
Mobile Game and Application with J2ME
PDF
Exploring Canvas
The Ring programming language version 1.5.1 book - Part 44 of 180
MongoDBで作るソーシャルデータ新解析基盤
MongoDB dla administratora
Pandas+postgre sql 實作 with code
はじめてのMongoDB
The Ring programming language version 1.10 book - Part 56 of 212
Sokoban Game Development Using Java ( Updated using Screenshots & Class Diagr...
Asssignment2
The Ring programming language version 1.7 book - Part 63 of 196
Webinarserie: Einführung in MongoDB: “Back to Basics” - Teil 3 - Interaktion ...
Webinar: Building Your First App in Node.js
Detection of errors and potential vulnerabilities in C and C++ code using the...
20110514 mongo dbチューニング
Binomial heap
The Ring programming language version 1.5.2 book - Part 66 of 181
Improved Security Proof for the Camenisch- Lysyanskaya Signature-Based Synchr...
Mobile Game and Application with J2ME - Collision Detection
Mobile Game and Application with J2ME
Exploring Canvas
Ad

Similar to MongoDB Analytics (20)

PDF
All I know about rsc.io/c2go
PDF
MongoDB With Style
PPTX
How to leverage what's new in MongoDB 3.6
ODP
Mongo db dla administratora
PPTX
Webinar: Applikationsentwicklung mit MongoDB : Teil 5: Reporting & Aggregation
PDF
MongoDB Europe 2016 - Debugging MongoDB Performance
PPTX
A miało być tak... bez wycieków
PDF
Malli: inside data-driven schemas
PPTX
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
PDF
Presto in Treasure Data (presented at db tech showcase Sapporo 2015)
ODP
Beyond php it's not (just) about the code
PDF
MongoDB World 2019: Event Horizon: Meet Albert Einstein As You Move To The Cloud
DOCX
Computer science project work on C++
PDF
Regression and Classification with R
PDF
MongoDB Aggregation Framework in action !
PPTX
Best Bugs from Games: Fellow Programmers' Mistakes
PPTX
MongoDB
PDF
TDC2018SP | Trilha .Net - Novidades do C# 7 e 8
PPTX
1403 app dev series - session 5 - analytics
PDF
Py conkr 20150829_docker-python
All I know about rsc.io/c2go
MongoDB With Style
How to leverage what's new in MongoDB 3.6
Mongo db dla administratora
Webinar: Applikationsentwicklung mit MongoDB : Teil 5: Reporting & Aggregation
MongoDB Europe 2016 - Debugging MongoDB Performance
A miało być tak... bez wycieków
Malli: inside data-driven schemas
Самые вкусные баги из игрового кода: как ошибаются наши коллеги-программисты ...
Presto in Treasure Data (presented at db tech showcase Sapporo 2015)
Beyond php it's not (just) about the code
MongoDB World 2019: Event Horizon: Meet Albert Einstein As You Move To The Cloud
Computer science project work on C++
Regression and Classification with R
MongoDB Aggregation Framework in action !
Best Bugs from Games: Fellow Programmers' Mistakes
MongoDB
TDC2018SP | Trilha .Net - Novidades do C# 7 e 8
1403 app dev series - session 5 - analytics
Py conkr 20150829_docker-python
Ad

More from datablend (7)

PDF
Coalition cocktail - hack the elections
PDF
The Power of Graphs to Analyze Biological Data
PDF
Introduction to Graph Databases @ SAI
PDF
FluxGraph @ GraphDevRoom
PDF
The power of graphs to analyze biological data
PDF
FluxGraph: a time-machine for your graphs
PDF
8 things I like about Datomic
Coalition cocktail - hack the elections
The Power of Graphs to Analyze Biological Data
Introduction to Graph Databases @ SAI
FluxGraph @ GraphDevRoom
The power of graphs to analyze biological data
FluxGraph: a time-machine for your graphs
8 things I like about Datomic

Recently uploaded (20)

PDF
Profit Center Accounting in SAP S/4HANA, S4F28 Col11
PDF
Blue Purple Modern Animated Computer Science Presentation.pdf.pdf
PDF
TokAI - TikTok AI Agent : The First AI Application That Analyzes 10,000+ Vira...
PDF
Spectral efficient network and resource selection model in 5G networks
PDF
Architecting across the Boundaries of two Complex Domains - Healthcare & Tech...
PDF
Network Security Unit 5.pdf for BCA BBA.
PDF
Building Integrated photovoltaic BIPV_UPV.pdf
PDF
How UI/UX Design Impacts User Retention in Mobile Apps.pdf
PDF
cuic standard and advanced reporting.pdf
PDF
The Rise and Fall of 3GPP – Time for a Sabbatical?
PPTX
Spectroscopy.pptx food analysis technology
PDF
Reach Out and Touch Someone: Haptics and Empathic Computing
PDF
KodekX | Application Modernization Development
DOCX
The AUB Centre for AI in Media Proposal.docx
PDF
Electronic commerce courselecture one. Pdf
PDF
7 ChatGPT Prompts to Help You Define Your Ideal Customer Profile.pdf
PPTX
Programs and apps: productivity, graphics, security and other tools
PDF
Review of recent advances in non-invasive hemoglobin estimation
PPTX
Effective Security Operations Center (SOC) A Modern, Strategic, and Threat-In...
PDF
Agricultural_Statistics_at_a_Glance_2022_0.pdf
Profit Center Accounting in SAP S/4HANA, S4F28 Col11
Blue Purple Modern Animated Computer Science Presentation.pdf.pdf
TokAI - TikTok AI Agent : The First AI Application That Analyzes 10,000+ Vira...
Spectral efficient network and resource selection model in 5G networks
Architecting across the Boundaries of two Complex Domains - Healthcare & Tech...
Network Security Unit 5.pdf for BCA BBA.
Building Integrated photovoltaic BIPV_UPV.pdf
How UI/UX Design Impacts User Retention in Mobile Apps.pdf
cuic standard and advanced reporting.pdf
The Rise and Fall of 3GPP – Time for a Sabbatical?
Spectroscopy.pptx food analysis technology
Reach Out and Touch Someone: Haptics and Empathic Computing
KodekX | Application Modernization Development
The AUB Centre for AI in Media Proposal.docx
Electronic commerce courselecture one. Pdf
7 ChatGPT Prompts to Help You Define Your Ideal Customer Profile.pdf
Programs and apps: productivity, graphics, security and other tools
Review of recent advances in non-invasive hemoglobin estimation
Effective Security Operations Center (SOC) A Modern, Strategic, and Threat-In...
Agricultural_Statistics_at_a_Glance_2022_0.pdf

MongoDB Analytics

  • 5. chemicalsimilarity(1) ★ 31millioncompoundsavailable ➡ pubchem ➡ Question: ★ findcompoundssimilartoa particularothercompound
  • 6. chemicalsimilarity(2) 0[N]1[C  O]2[C  C  C] 0[N]1[C  O]2[C  C  C]3[C  C  C  C  C] 0[C]1[C  C  C]2[C  C  N  O]3[C  C  C  C  O  O] 0[C]1[C  C]2[C  C  C  C  O]3[C  C  N  O] 0[O]1[C]2[C  O]3[C  C  C] 0[C]1[C  O  O]2[C  C  C  O] 0[C]1[C  C]2[C  C] 0[C]1[C]2[C]3[C  O] 0[C]1[C  C  N]2[C  C  C  C  O]3[C  C  C  O] ...
  • 7. chemicalsimilarity(3) 0[N]1[C  O]2[C  C  C] 0[N]1[C  O]2[C  C  C]3[C  C  C  C  C] 0[C]1[C  C  C]2[C  C  N  O]3[C  C  C  C  O  O] 0[C]1[C  C]2[C  C  C  C  O]3[C  C  N  O] 0[O]1[C]2[C  O]3[C  C  C] 0[C]1[C  O  O]2[C  C  C  O] 0[C]1[C  C]2[C  C] 0[C]1[C]2[C]3[C  O] 0[C]1[C  C  N]2[C  C  C  C  O]3[C  C  C  O] ... 0[N]1[C  O]2[C  C  C]3[C  C  C  C  C  C] 0[C]1[C  C  C]2[C  C  N  O]3[C  C  C  C  O  O] 0[C]1[C  C]2[C  C  C  C  O]3[C  C  N  O] 0[O]1[C]2[C  O]3[C  C  C  C] 0[C]1[C  O  O]2[C  C  C  O] 0[C]1[C  C]2[C  C] 0[N]1[C  O]2[C  C  C] 0[C]1[C]2[C]3[C  O] 0[C]1[C  C  N]2[C  C  C  C  O]3[C  C  C  O] ... equalityviatanimoto but31millioncalculations?
  • 8. mongodbdatamodel(1) {          "compound_cid"  :  "46200001"  ,          "smiles"  :  "CCC1C(C(C(C(=NOCC=CCN2CCCCC2)C(CC(C(C(C(C(C(=O)O1)C)OC3C"  ,        "fingerprint_count"  :  120  ,          "fingerprints"  :  [                  "0[N]1[C  O]2[C  C  C]"  ,                "0[N]1[C  O]2[C  C  C]3[C  C  C  C  C]"  ,                "0[C]1[C  C  C]2[C  C  N  O]3[C  C  C  C  O  O]"  ,                "0[C]1[C  C]2[C  C  C  C  O]3[C  C  N  O]"  ,                "0[O]1[C]2[C  O]3[C  C  C]"  ,                  "0[C]1[C  O  O]2[C  C  C  O]"  ,                  "0[C]1[C  C]2[C  C]"  ,                  "0[C]1[C]2[C]3[C  O]"  ,                  "0[C]1[C  C  N]2[C  C  C  C  O]3[C  C  C  O]"  ,                ...  ]  ,   } compound collection
  • 9. mongodbdatamodel(2) fingerprint collection {          "fingerprint"  :  "0[N]1[C  O]2[C  C  C]",        "count"  :  472 } {          "fingerprint"  :  "0[N]1[C  O]2[C  C  C]3[C  C  C  C  C]",        "count"  :  41 } {        "fingerprint"  :  "0[O]1[C]2[C  O]3[C  C  C]",        "count"  :  1343 }
  • 10. queryingpattern(1) ★ from31million->potentialmatch ➡ narrowdownthesearchspace ➡ imagine80%searchforacompoundwith40features ➡ 32 ➡ 50
  • 11. queryingpattern(2) ★ from31million->potentialmatch ➡ narrowdownthesearchspace ➡ imagine80%searchforacompoundwith40features (9fingerprints)
  • 12. findthefingerprints(1) //  Retrieve  the  particular  compound DBObject  object  =   compoundsCollection.findOne(QueryBuilder.start("compound_cid").is(compound).get()); //  Retrieve  the  relevant  properties String  pubchemcid  =  (String)object.get(COMPOUNDCID_PROPERTY); List<Integer>  fingerprintstofind  =   Arrays.asList(((BasicDBList)object.get(FINGERPRINTS_PROPERTY)).toArray(new   Integer[]{})); //  Sort  the  fingerprints  on  total  number  of  occurences fingerprintstofind  =  findSortedFingerprints(fingerprintstofind);
  • 13. findthefingerprints(2) List<Integer>  sortedFingerprintsToFind  =  new  ArrayList<Integer>();    //  Find  all  fingerprint  count  documents    DBObject  fingerprintcountquery  =          QueryBuilder.start(FINGERPRINT_PROPERTY).in(fingerprintsToFind.toArray()).get();        //  Only  retrieve  the  fingerprint  string  itself    DBObject  fingerprintcountselection  =          QueryBuilder.start(FINGERPRINT_PROPERTY).is(1).get();                    //  Sort  the  result  on  count    DBObject  fingerprintcountsort  =  QueryBuilder.start(COUNT_PROPERTY).is(1).get();    //  Execute  the  query  on  the  fingerprint  counts  collection    DBCursor  fingerprintcounts  =          fingerprintCountsCollection.find(fingerprintcountquery,  fingerprintcountselection).        sort(fingerprintcountsort);
  • 14. nativequery(1) //  Find  the  matching  compounds DBObject  compoundquery  =      QueryBuilder.        start(FINGERPRINTS_PROPERTY).        in(fingerprintsToConsider).        and(FINGERPRINTCOUNT_PROPERTY).lessThanEquals(maxnumberofcompoundfingerprints).        and(FINGERPRINTCOUNT_PROPERTY).greaterThanEquals(minnumberofcompoundfingerprints).        get();
  • 15. nativequery(2) //  Execute  the  query DBCursor  compounds  =  compoundsCollection.find(compoundquery);     //  Let's  process  the  found  compounds  locally while(compounds.hasNext())  {    DBObject  compound  =  compounds.next();    BasicDBList  fingerprints  =  ((BasicDBList)            compound.get(FINGERPRINTS_PROPERTY));        //  Calculate  the  intersection  on  the  total  list  of  fingerprints    fingerprints.retainAll(fingerprintsToFind);    if  (fingerprints.size()  >=  minnumberofcompoundfingerprints)  {        //  Calculate  the  tanimoto  coefficient  ...    } }                                
  • 17. map/reducequery(2)  //  Find  all  compounds DBObject  compoundquery  =  ...   //  The  map  fuction String  map  =  "function()  {    "  +                          "    var  found  =  0;  "  +                          "    var  fingerprintslength  =  this.fingerprints.length;  "  +                          "    for  (i  =  0;  i  <  fingerprintslength;  i++)  {  "  +                          "        if  (fingerprintstofind[this.fingerprints[i]]  ===  true)  {  found++;  }  "  +                          "    }  "  +                          "    if  (found  >=  minnumberofcompoundfingerprints)  {  "  +                          "        emit  (this.compound_cid,  {found  :  found,  "  +                            "                                                            total  :  this.fingerprint_count}  );  }  "  +                          "}"; //  Execute  the  map  reduce  function MapReduceCommand  mr  =  new  MapReduceCommand(compoundsCollection,  map,  "",      MapReduceCommand.OutputType.INLINE,  compoundquery);                              
  • 19. aggregationframework(2) {  "aggregate"  :  "compounds"  ,      "pipeline"  :  [            {  "$match"  :  {  "fingerprint_count"  :  {  "$gte"  :  4  ,  "$lte"  :  1780}}}  ,            {  "$unwind"  :  "$fingerprints"}  ,            {  "$match"  :  {  "fingerprints"  :  {  "$in"  :  [  1960,  15111,  ...,94  ,  26]}}}  ,              {  "$group"  :  {  "_id"  :  "$compound_cid"  ,                                          "fingerprintmatches"  :  {  "$sum"  :  1}  ,                                          "totalcount"  :  {  "$first"  :  "$fingerprint_count"}  }}}  ,              {  "$project"  :  {  "_id"  :  1  ,                                              "tanimoto"  :  {  "$divide"  :  [  "$fingerprintmatches"  ,                                                                        {  "$subtract"  :  [  {  "$add"  :  [  89  ,  "$totalcount"]}  ,                                                                            "$fingerprintmatches"]}]}}  ,            {  "$match"  :  {  "tanimoto"  :  {  "$gte"  :  0.05}}}]}                              
  • 20. benchmark results ★ native -> 202 ms ➡ 100K compounds, 0.8 tanimoto ★ map/reduce -> 214 ms ★ aggregation framework -> 609 ms ★ native -> 1909 ms ➡ 100K compounds, 0.05 tanimoto ★ map/reduce -> 20978 ms ★ aggregation framework -> 1613 ms
  • 21. diy mongodb analytics ... ➡ http://datablend.be/?p=256 ➡ the joy of algorithms and nosql: a mongodb example ➡ http://github.com/datablend/mongo-compound-comparison-revisited