SlideShare a Scribd company logo
The Best IoT Analytics with
MongoDB
Jake Angerman
Sr. Solutions Architect
MongoDB
Sessions:
1. Building an IoT Application that Will Work
Next Year
2. Building IoT Applications the Right Way
3. The Best IoT Analytics with MongoDB
Track Overview
✔
✔
Introduction
#MDBW16
Morpheus: time series data is everywhere
Morpheus picture
#MDBW16
Automatic Dependent Surveillance Broadcast (ADS-B)
Primary radar
Secondary Surveillance Radar
Software defined radio
1090 MHz
1030 MHz
1090 MHz
#MDBW16
Tin Can Reveal
homemade antenna
(6.9cm quarter-wave whip)
NooElec	NESDR	Mini	2	SDR	 $23.00	
USB	extension	cable	 $10.00	
RF	cable	RG316	female	to	MCX	male	 $5.50	
tin	can	 $2.87	
Total:	 $41.37	
6.9cm antenna
USB SDR
dump1090
#MDBW16
dump1090
#MDBW16
Antenna Range approximately 250 miles (400km)
> db.tincan.aggregate( [{ $geoNear: { near: { type: "Point", coordinates: [ center_y, center_x ] }, distanceField:
"meters", minDistance: 394289, limit: 100, spherical: true }}, {$sort: { "meters": -1}}, {$limit: 1} ])
#MDBW16
Antenna Range approximately 250 miles (400km)
> db.tincan.aggregate( [{ $geoNear: { near: { type: "Point", coordinates: [ center_y, center_x ] }, distanceField:
"meters", minDistance: 394289, limit: 100, spherical: true }}, {$sort: { "meters": -1}}, {$limit: 1} ])
#MDBW16
ADS-B BaseStation data format
MSG,7,111,11111,A3DC34,111111,2016/03/28,21:42:25.875,2016/03/28,21:42:25.865,,36975,,,,,,,,,,0
MSG,7,111,11111,A3DC34,111111,2016/03/28,21:42:25.884,2016/03/28,21:42:25.865,,36975,,,,,,,,,,0
MSG,8,111,11111,A33AA7,111111,2016/03/28,21:42:25.898,2016/03/28,21:42:25.865,,,,,,,,,,,,0
MSG,5,111,11111,A33AA7,111111,2016/03/28,21:42:25.961,2016/03/28,21:42:25.931,,28225,,,,,,,0,,0,0
MSG,3,111,11111,A678EF,111111,2016/03/28,21:42:26.013,2016/03/28,21:42:25.996,,34000,,,30.58369,-98.75438,,,,,,0
MSG,4,111,11111,A678EF,111111,2016/03/28,21:42:26.013,2016/03/28,21:42:25.996,,,417,283,,,0,,,,,0
MSG,3,111,11111,0D081C,111111,2016/03/28,21:42:26.280,2016/03/28,21:42:26.258,,35975,,,29.86456,-98.24018,,,,,,0
MSG,4,111,11111,0D081C,111111,2016/03/28,21:42:26.280,2016/03/28,21:42:26.258,,,429,206,,,0,,,,,0
MSG,8,111,11111,0D0648,111111,2016/03/28,21:42:26.358,2016/03/28,21:42:26.324,,,,,,,,,,,,0
MSG,3,111,11111,A678EF,111111,2016/03/28,21:42:26.454,2016/03/28,21:42:26.390,,34000,,,30.58389,-98.75544,,,,,,0
MSG,8,111,11111,A33AA7,111111,2016/03/28,21:42:26.478,2016/03/28,21:42:26.455,,,,,,,,,,,,0
MSG,7,111,11111,A678EF,111111,2016/03/28,21:42:26.679,2016/03/28,21:42:26.651,,34000,,,,,,,,,,0
MSG,7,111,11111,0D081C,111111,2016/03/28,21:42:26.759,2016/03/28,21:42:26.717,,35975,,,,,,,,,,0
altitude ICAO hex
lat/long
date & time
stamp
message
type
speed
#MDBW16
ADS-B in JSON
{
"timestamp" : ISODate("2016-01-31T20:54:35.000+0000"),
"icao" : "AC4144",
"callsign" : "N889WM",
"altitude" : 9350,
"bearing" : 150,
"position" : [-98.62762, 30.03657],
"ground_speed" : 152,
"vertical_rate" : 192
}
#MDBW16
dump1090
dump1090 data flow
Linked List in
RAM
HTTP
:8080
BaseStation
TCP
:30003
[{"hex":"ac741c", "squawk":"6234", "flight":"AAL2417 ",
"lat": 30.619176, "lon":-97.755963, "validposition":1,
"altitude":35975, "vert_rate":0,"track":202, "validtrack":1,
"speed":438, "messages":557, "seen":0}]
AJAX JSON
#MDBW16
dump1090
dump1090 data flow
Linked List in
RAM
HTTP
:8080
BaseStation
TCP
:30003
[{"hex":"ac741c", "squawk":"6234", "flight":"AAL2417 ",
"lat": 30.619176, "lon":-97.755963, "validposition":1,
"altitude":35975, "vert_rate":0,"track":202, "validtrack":1,
"speed":438, "messages":557, "seen":0}]
AJAX JSON
ingest.py
MSG,7,111,11111,A3DC34,111111,2016/03/28,
21:42:25.875,2016/03/28,21:42:25.865,,36975
MongoDB
TCP
#MDBW16
What Types of Analytics Can We Do?
•  Real-time dashboards (<1 second latency) = Aggregation framework
•  Ad-hoc queries = Aggregation framework
•  Historical Reports = Aggregation framework or BI Connector
•  Batch processing = Hadoop
•  Machine Learning = Spark
#MDBW16
Analytics without Data Migration
Database
Historical
Analysis
Devices
Dashboards
DB
DB
ETL
ETL
#MDBW16
Analytics without Data Migration
Database
Historical
Analysis
Devices
DB
DB
ETL
ETL
Dashboards
#MDBW16
Analytics without Data Migration
Database
Historical
Analysis
Devices
Dashboards
•  No bulk or incremental ETL required
•  One language for both real-time and ad-hoc queries
#MDBW16
replica set
Workload Isolation
Historical
Analysis
Devices
Dashboards
primary
secondary
secondary
Aggregation Framework
#MDBW16
Aggregation framework
#MDBW16
dump1090
dump1090 dashboard
Linked List in
RAM
HTTP
:8080
BaseStation
TCP
:30003
[{"hex":"ac741c", "squawk":"6234", "flight":"AAL2417 ",
"lat":30.619176, "lon":-97.755963, "validposition":1,
"altitude":35975, "vert_rate":0,"track":202, "validtrack":1,
"speed":438, "messages":557, "seen":0}]
AJAX JSON
ingest.py
MSG,7,111,11111,A3DC34,111111,2016/03/28,
21:42:25.875,2016/03/28,21:42:25.865,,36975
MongoDB
TCP
WT cache
#MDBW16
Real-time Dashboards
•  Current Radar, last 5 minutes' worth of aircraft data
•  pipeline = [
{"$match": {"t": {"$gte": datetime.datetime.utcnow() - datetime.timedelta(minutes=5) }}},
{"$sort": { "icao":1, "t":1 }},
{"$group": {"_id" : {"icao": "$icao"},
"events": {"$push": {"flight":"$callsign", "altitude":"$a", "track":"$b",
"speed":"$s", "lon": { "$arrayElemAt":["$p", 0] },
"lat": { "$arrayElemAt":["$p", 1] }, "vert_rate":"$v" }},
"sum": {"$sum":1}}},
{"$project" :{ "_id":0, "icao":"$_id.icao", "events":"$events", "sum":"$sum"  }}  ]
$match first uses index
pre-built array avoids
clumsy looping in
application
#MDBW16
Ad hoc aggregations
Which aircraft has the most observations?
> db.tincan.aggregate([
{ $group: {
_id: "$icao",
"sum": {$sum: 1},
"callsigns": {"$addToSet": "$callsign"} }},
{ $sort: { "sum": -1 }},
{$limit: 1}
])
{
"_id": ObjectId("5755..."),
"icao": "ADE201",
"callsign": "N994FE",
"a": 8600,
"b": 104,
"p": [-98.99888, 30.93031],
"s": 164,
"t": ISODate("2016-02-09T02:33:01Z"),
}
#MDBW16
Which aircraft has the most observations?
"result": [
{
"_id": "ADE201",
"sum": 14373,
"callsigns": [
"N994FE"
]
}
{
"_id": ObjectId("5755..."),
"icao": "ADE201",
"callsign": "N994FE",
"a": 8600,
"b": 104,
"p": [-98.99888, 30.93031],
"s": 164,
"t": ISODate("2016-02-09T02:33:01Z"),
}
#MDBW16
ICAO aircraft collection
$ mongoimport -d adsb -c aircraft --type csv --headerline aircraft_db.csv
icao,regid,mdl,type,operator
000334,PU-PLS,ULAC,EDRA SUPER PETREL LS,PRIVATE OWNER
000D77,PU-VGA,WT9,WT-9 DYNAMIC,PRIVATE OWNER
000D82,PU-DCT,WT9,AEROSPOOL WT9 DYNAMIC,PRIVATE OWNER
001100,-,320,UNKNOWN / VARIOUS,CODE USED BY SEVERAL AIRCRAFT
001108,EJC-1108,AC90,GULFSTREAM 690D,EJERCITO DE COLOMBIA
001411,PU-BGC,RV9,AMATEUR VANS RV-9A,PRIVATE OWNER
002008,LV-S004,P208,TECNAM P-2008,PRIVATE OWNER
003106,PU-FUA,ULAC,AMATEUR GFLY,PRIVATE OWNER
004003,Z-WPB,B732,BOEING 737-2N0,AIR ZIMBABWE
...
#MDBW16
$lookup to find aircraft model
> db.tincan.aggregate([
{ $group: {
_id: "$icao",
"sum": {$sum: 1},
"callsigns": {"$addToSet": "$callsign"} }},
{ $sort: { "sum": -1 }},
{ $limit: 1 },
{ $lookup: {
from:"aircraft",
localField:"_id",
foreignField:"icao",
as:"description" }}
])
#MDBW16
$lookup to find aircraft model
"result": [
{
"_id": "ADE201",
"sum": 14373,
"callsigns": [
"N994FE"
],
"description": [
{
"_id": ObjectId("575074300cf625050f2e730e"),
"icao": "ADE201",
"regid": "N994FE",
"mdl": "C208",
"type": "CESSNA 208B GRAND CARAVAN"
}
]
#MDBW16
FedEx
#MDBW16
Which aircraft is seen the most number of days?
> db.tincan.aggregate([
{ $group: {
_id: {icao: "$icao", dayOfYear: {$dateToString: { format: "%Y%m%d",
date: "$t"}}}}},
{$group:{
_id: "$_id.icao",
sum: { $sum: 1 }}},
{ $sort:{ "sum": -1 }},
{ $limit: 1 },
{ $lookup: {
from:"aircraft",
localField:"_id",
foreignField:"icao",
as:"description" }}
])
#MDBW16
Which aircraft is seen the most number of days?
  "result": [
    {
      "_id": "A35969",
      "sum": 63,
      "description": [
        {
          "_id": ObjectId("5762e9cf6ecfc147a0503894"),
          "icao": "A35969",
          "regid": "N315AE",
          "mdl": "B06",
          "type": "BELL 206L-1 LONGRANGER II",
          "operator": "AIR EVAC EMS"
        }
      ]
#MDBW16
Business Intelligence Connector
#MDBW16
BI Connector
•  New in MongoDB 3.2 Enterprise Advanced
•  Mapping and transformation layer
•  Projects smaller parts of large data sets for reporting
#MDBW16
MongoDB Query Language SQL
BI Connector Data flow
MongoDB	
BI	
Connector	
Mapping	
metadata	
Application	data	
{name:

“Andrew”,
address:
{street:
…}}
Document	Table	Analytics	&	visualization
#MDBW16
FedEx N994FE Flight Paths
#MDBW16
Observations per Operator
#MDBW16
Altitude vs Speed
•  Two predictable clusters:
•  turbine aircraft at cruising
altitude
•  piston aircraft at lower
altitude
#MDBW16
Altitude vs Speed
•  Two predictable clusters:
•  turbine aircraft at cruising
altitude
•  piston aircraft at lower
altitude
#MDBW16
Altitude vs Speed
•  Two predictable clusters:
•  turbine aircraft at cruising
altitude
•  piston aircraft at lower
altitude
•  Outliers are Cessnas
reporting 51,000+ ft
Spark
#MDBW16
Spark Overview
•  fast, general data processing engine
•  interactive shell
•  Scala, Java, Python
•  machine learning libraries (mllib)
•  supports streaming
•  HDFS not required
#MDBW16
Spark Connector
Connector
BSON Files
MapReduce & HDFS
#MDBW16
Spark Connector Diagram
•  diagram
MongoDB Connector for Hadoop (with Spark Plug-in)
https://github.com/mongodb/mongo-hadoop
MongoDB Connector for Spark
https://github.com/mongodb/mongo-spark
#MDBW16
Supervised Unsupervised
Classification
•  Naive Bayes
•  Support Vector
Machines
•  Random Decision
Forests
Clustering
•  K-means
Regression
•  Linear
•  Logistic
Dimensionality
Reduction
•  Principal Component
Analysis
•  Singular Value
Decomposition
Spark Machine Learning
#MDBW16
K-Means Clustering
The K-Means algorithm aims to
minimize the sum of squares of the
distance between the points and the
centroid of each cluster.
source: Lovro Iliassich, toptal.com
#MDBW16
K-Means Clustering
>>> mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/adsb.tincan')
OR specify a filter:
>>> input_conf = {"mongo.job.input.format":
"com.mongodb.hadoop.MongoInputFormat", "mongo.input.uri": "mongodb://
localhost:27017/adsb.tincan", "mongo.input.query": '{"t":{"$lte":{"$date":
1455494400000}}}' }
>>> mongo_rdd = sc.newAPIHadoopRDD(inputFormatClassName,
keyClassName, valueClassName, None, None, input_conf)
#MDBW16
K-Means Clustering
>>> mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/adsb.tincan')
>>> mongo_rdd.first()
{u'icao': u'A06690', u'a': 11975, u'b': 150, u'_id':
ObjectId('5755bb862355da56d87895cf'), u't': datetime.datetime(2016, 2, 8, 5,
25, 4), u'p': [-98.41437, 30.29066], u's': 285, u'v': -1152}
#MDBW16
K-Means Clustering
>>> mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/adsb.tincan')
>>> mongo_rdd.first()
{u'icao': u'A06690', u'a': 11975, u'b': 150, u'_id':
ObjectId('5755bb862355da56d87895cf'), u't': datetime.datetime(2016, 2, 8, 5,
25, 4), u'p': [-98.41437, 30.29066], u's': 285, u'v': -1152}
>>> parsed_rdd = mongo_rdd.map(parseData)
>>> parsed_rdd.first()
[5, 25, 4, 1, 11975, 150, 285, -1152, -98.14857, 30.92651]
#MDBW16
Choosing K
$$\mathrm{WSSSE} = \sum_{i=1}^{k} \sum_{x \in S_i} \lVert x - \mu_i \rVert^{2}$$
	
	0
2,000,000
4,000,000
6,000,000
8,000,000
10,000,000
12,000,000
14,000,000
0 20 40 60 80 100 120 140 160 180 200
k
Within Set Sum of Squared Error
WSSSE
#MDBW16
Standard Scaling
	
$$z = \frac{x - \mu}{\sigma}$$
	
>>> parsed_rdd.first()
[5, 25, 4, 1, 11975, 150, 285, -1152, -98.14857, 30.92651]
>>> scaled_features.first()
[-1.036, -1.1089, -0.2617, 0.6821, -0.8202, 0.4057, 0.8537, -1.6502, -0.6559, 0.6876]
#MDBW16
K-Means Clustering
>>> k = 10
>>> clusters = KMeans.train(parsed_rdd, k, maxIterations=10, runs=1,
initializationMode="random")
>>> cluster_sizes = parsed_rdd.map(lambda e:
clusters.predict(e)).countByValue()
>>> cluster_sizes
defaultdict(<type 'int'>, {0: 70122, 1: 350890, 2: 118596, 3: 104609, 4:
254759, 5: 175840, 6: 166789, 7: 68309, 8: 147826, 9: 495102})
#MDBW16
Save Results Back to MongoDB
def labelData(array):
     result = {}
     result['cluster'] = clusters.predict(array)
     result['daystamp'] = str(array[0])
     result['dayofweek'] = array[1]
     result['hour'] = array[2]
     result['minute'] = array[3]
     result['second'] = array[4]
     result['a'] = array[5]
     result['b'] = array[6]
     result['s'] = array[7]
     result['v'] = array[8]
     result['p'] = [ array[9], array[10] ]
     return result
>>> labeled_rdd = parsed_rdd.map(labelData)
>>> labeled_rdd.saveToMongoDB('mongodb://
localhost:27017/adsb.labeled')
#MDBW16
K-Means Clustering
>>> cluster_sizes
defaultdict(<type 'int'>, {0: 70122, 1: 350890, 2:
118596, 3: 104609, 4: 254759, 5: 175840, 6: 166789,
7: 68309, 8: 147826, 9: 495102})
Hypothesis: largest cluster #9 is cruising altitude
#MDBW16
Hypothesis: largest cluster #9 is cruising altitude
adsb> db.labeled.aggregate([
{$match: {cluster:9}},
{$group: {_id: "summary",
"avg_alt": {$avg:"$a"},
"min_alt": {$min:"$a"},
"max_alt": {$max:"$a"} }}])
#MDBW16
Hypothesis: largest cluster #9 is cruising altitude
  "result": [
    {
      "_id": "summary",
      "avg_alt": 33630,
      "min_alt": 30675,
      "max_alt": 35825
    }
#MDBW16
Anomaly Detection
#MDBW16
Anomaly!
•  Plane appears at 12,000ft out of
nowhere
#MDBW16
planefinder.net video
#MDBW16
Don't Worry, He's OK
•  4 days later…
#MDBW16
Summary
MongoDB
Machine
Learning
Devices
Historical
Reporting
Real-time
Dashboard
https://github.com/kerneljake/adsb
#MDBW16
Market Size
$36 Billion
Partners
1,000+
International Offices
15
Global Employees
575+
Downloads Worldwide
15,000,000+
Make a GIANT Impact
www.mongodb.com/careers

More Related Content

PPTX
MongoDB for Time Series Data Part 1: Setting the Stage for Sensor Management
PPTX
MongoDB for Time Series Data
PPTX
How Thermo Fisher Is Reducing Mass Spectrometry Experiment Times from Days to...
PPTX
MongoDB for Time Series Data: Schema Design
PPTX
MongoDB for Time Series Data Part 3: Sharding
PDF
RedisConf18 - Redis and Elasticsearch
PDF
Big Data Expo 2015 - Gigaspaces Making Sense of it all
PPTX
MongoDB + Spring
MongoDB for Time Series Data Part 1: Setting the Stage for Sensor Management
MongoDB for Time Series Data
How Thermo Fisher Is Reducing Mass Spectrometry Experiment Times from Days to...
MongoDB for Time Series Data: Schema Design
MongoDB for Time Series Data Part 3: Sharding
RedisConf18 - Redis and Elasticsearch
Big Data Expo 2015 - Gigaspaces Making Sense of it all
MongoDB + Spring

What's hot (20)

PDF
Webinar: Managing Real Time Risk Analytics with MongoDB
PDF
PPTX
MongoDB for Time Series Data: Setting the Stage for Sensor Management
PPTX
MongoDB for Time Series Data Part 2: Analyzing Time Series Data Using the Agg...
PDF
Data Analytics with Druid
PDF
Aggregated queries with Druid on terrabytes and petabytes of data
PDF
Imply at Apache Druid Meetup in London 1-15-20
PPTX
Webinar: Choosing the Right Shard Key for High Performance and Scale
PPTX
Druid realtime indexing
PPTX
Data Modeling IoT and Time Series data in NoSQL
PDF
MongoDB .local Toronto 2019: MongoDB Atlas Search Deep Dive
PPTX
Joins and Other MongoDB 3.2 Aggregation Enhancements
PDF
Tweaking perfomance on high-load projects_Думанский Дмитрий
PDF
Real-time Analytics with Apache Flink and Druid
PDF
WSO2 Stream Processor: Graphical Editor, HTTP & Message Trace Analytics and m...
PDF
MongoDB .local San Francisco 2020: From SQL to NoSQL -- Changing Your Mindset
PDF
ManetoDB: Key/Value storage, BigData in Open Stack_Сергей Ковалев, Илья Свиридов
PPTX
High Performance Applications with MongoDB
PPTX
MongoDB for Time Series Data: Sharding
PPTX
Programmatic Bidding Data Streams & Druid
Webinar: Managing Real Time Risk Analytics with MongoDB
MongoDB for Time Series Data: Setting the Stage for Sensor Management
MongoDB for Time Series Data Part 2: Analyzing Time Series Data Using the Agg...
Data Analytics with Druid
Aggregated queries with Druid on terrabytes and petabytes of data
Imply at Apache Druid Meetup in London 1-15-20
Webinar: Choosing the Right Shard Key for High Performance and Scale
Druid realtime indexing
Data Modeling IoT and Time Series data in NoSQL
MongoDB .local Toronto 2019: MongoDB Atlas Search Deep Dive
Joins and Other MongoDB 3.2 Aggregation Enhancements
Tweaking perfomance on high-load projects_Думанский Дмитрий
Real-time Analytics with Apache Flink and Druid
WSO2 Stream Processor: Graphical Editor, HTTP & Message Trace Analytics and m...
MongoDB .local San Francisco 2020: From SQL to NoSQL -- Changing Your Mindset
ManetoDB: Key/Value storage, BigData in Open Stack_Сергей Ковалев, Илья Свиридов
High Performance Applications with MongoDB
MongoDB for Time Series Data: Sharding
Programmatic Bidding Data Streams & Druid
Ad

Viewers also liked (20)

PPTX
MongoDB and the Internet of Things
PPTX
Webinar: MongoDB and Analytics: Building Solutions with the MongoDB BI Connector
PDF
MongoDB and the Internet of Things
PPTX
Internet of Things and Big Data: Vision and Concrete Use Cases
PDF
IOT Paris Seminar 2015 - Enabling Transformation to the IOT
PPTX
IOT Paris Seminar 2015 - intro by Yann Aubry
KEY
Thoughts on MongoDB Analytics
PPTX
Social Analytics on MongoDB at MongoNYC
PPT
Klmug presentation - Simple Analytics with MongoDB
PDF
Internet of Things Cologne 2015: Rethinking Global Real-Time Data Integration...
PDF
IOT Paris Seminar 2015 - Connected Objects makers, How to deal with Data?
PDF
Internet of Things Cologne 2015: The Contribution of New Data Storage and Ana...
PPT
MongoDB IoT City Tour STUTTGART: Analysing the Internet of Things. By, Pentaho
PDF
MongoDB World 2016: Number Crush
PPTX
MongoDB World 2016: MongoDB + Google Cloud
PDF
MongoDB Solution for Internet of Things and Big Data
PDF
MongoDB World 2016: From the Polls to the Trolls: Seeing What the World Think...
PDF
Data analysis and visualization with mongo db [mongodb world 2016]
PPTX
Overcoming the Barriers to Blockchain Adoption
PPTX
IOT Paris Seminar 2015 - MAXXING Presentation
MongoDB and the Internet of Things
Webinar: MongoDB and Analytics: Building Solutions with the MongoDB BI Connector
MongoDB and the Internet of Things
Internet of Things and Big Data: Vision and Concrete Use Cases
IOT Paris Seminar 2015 - Enabling Transformation to the IOT
IOT Paris Seminar 2015 - intro by Yann Aubry
Thoughts on MongoDB Analytics
Social Analytics on MongoDB at MongoNYC
Klmug presentation - Simple Analytics with MongoDB
Internet of Things Cologne 2015: Rethinking Global Real-Time Data Integration...
IOT Paris Seminar 2015 - Connected Objects makers, How to deal with Data?
Internet of Things Cologne 2015: The Contribution of New Data Storage and Ana...
MongoDB IoT City Tour STUTTGART: Analysing the Internet of Things. By, Pentaho
MongoDB World 2016: Number Crush
MongoDB World 2016: MongoDB + Google Cloud
MongoDB Solution for Internet of Things and Big Data
MongoDB World 2016: From the Polls to the Trolls: Seeing What the World Think...
Data analysis and visualization with mongo db [mongodb world 2016]
Overcoming the Barriers to Blockchain Adoption
IOT Paris Seminar 2015 - MAXXING Presentation
Ad

Similar to MongoDB World 2016: The Best IoT Analytics with MongoDB (20)

PDF
MongoDB and the Internet of Things
PDF
MongoDB .local London 2019: Best Practices for Working with IoT and Time-seri...
PPTX
Webinar: Realizing the Promise of Machine to Machine (M2M) with MongoDB
PDF
MongoDB .local Paris 2020: Les bonnes pratiques pour travailler avec les donn...
PPTX
MongoDB BI Connector & Tableau
PDF
MongoDB Europe 2016 - Enabling the Internet of Things at Proximus - Belgium's...
PDF
MongoDB .local Houston 2019: Best Practices for Working with IoT and Time-ser...
PDF
A Century Of Weather Data - Midwest.io
PDF
IoT databases - review and challenges - IoT, Hardware & Robotics meetup - onl...
PDF
MongoDB Days Silicon Valley: Winning the Dreamforce Hackathon with MongoDB
PDF
MongoDB SoCal 2020: Best Practices for Working with IoT and Time-series Data
PDF
MongoDB .local Chicago 2019: Best Practices for Working with IoT and Time-ser...
PDF
MongoDB .local Munich 2019: Best Practices for Working with IoT and Time-seri...
PPTX
Jumpstart: Your Introduction to MongoDB
PDF
DDS-to-JSON and DDS Real-time Data Storage with MongoDB
PPTX
MongoDB IoT City Tour EINDHOVEN: Managing the Database Complexity
PDF
Lab pratico per la progettazione di soluzioni MongoDB in ambito Internet of T...
PPTX
Advanced applications with MongoDB
PPTX
Business Jumpstart: The Right (and Wrong) Use Cases for MongoDB
PPTX
MongoDB Schema Design: Practical Applications and Implications
MongoDB and the Internet of Things
MongoDB .local London 2019: Best Practices for Working with IoT and Time-seri...
Webinar: Realizing the Promise of Machine to Machine (M2M) with MongoDB
MongoDB .local Paris 2020: Les bonnes pratiques pour travailler avec les donn...
MongoDB BI Connector & Tableau
MongoDB Europe 2016 - Enabling the Internet of Things at Proximus - Belgium's...
MongoDB .local Houston 2019: Best Practices for Working with IoT and Time-ser...
A Century Of Weather Data - Midwest.io
IoT databases - review and challenges - IoT, Hardware & Robotics meetup - onl...
MongoDB Days Silicon Valley: Winning the Dreamforce Hackathon with MongoDB
MongoDB SoCal 2020: Best Practices for Working with IoT and Time-series Data
MongoDB .local Chicago 2019: Best Practices for Working with IoT and Time-ser...
MongoDB .local Munich 2019: Best Practices for Working with IoT and Time-seri...
Jumpstart: Your Introduction to MongoDB
DDS-to-JSON and DDS Real-time Data Storage with MongoDB
MongoDB IoT City Tour EINDHOVEN: Managing the Database Complexity
Lab pratico per la progettazione di soluzioni MongoDB in ambito Internet of T...
Advanced applications with MongoDB
Business Jumpstart: The Right (and Wrong) Use Cases for MongoDB
MongoDB Schema Design: Practical Applications and Implications

More from MongoDB (20)

PDF
MongoDB SoCal 2020: Migrate Anything* to MongoDB Atlas
PDF
MongoDB SoCal 2020: Go on a Data Safari with MongoDB Charts!
PDF
MongoDB SoCal 2020: Using MongoDB Services in Kubernetes: Any Platform, Devel...
PDF
MongoDB SoCal 2020: A Complete Methodology of Data Modeling for MongoDB
PDF
MongoDB SoCal 2020: From Pharmacist to Analyst: Leveraging MongoDB for Real-T...
PDF
MongoDB SoCal 2020: MongoDB Atlas Jump Start
PDF
MongoDB .local San Francisco 2020: Powering the new age data demands [Infosys]
PDF
MongoDB .local San Francisco 2020: Using Client Side Encryption in MongoDB 4.2
PDF
MongoDB .local San Francisco 2020: Using MongoDB Services in Kubernetes: any ...
PDF
MongoDB .local San Francisco 2020: Go on a Data Safari with MongoDB Charts!
PDF
MongoDB .local San Francisco 2020: MongoDB Atlas Jumpstart
PDF
MongoDB .local San Francisco 2020: Tips and Tricks++ for Querying and Indexin...
PDF
MongoDB .local San Francisco 2020: Aggregation Pipeline Power++
PDF
MongoDB .local San Francisco 2020: A Complete Methodology of Data Modeling fo...
PDF
MongoDB .local San Francisco 2020: MongoDB Atlas Data Lake Technical Deep Dive
PDF
MongoDB .local San Francisco 2020: Developing Alexa Skills with MongoDB & Golang
PDF
MongoDB .local Paris 2020: Realm : l'ingrédient secret pour de meilleures app...
PDF
MongoDB .local Paris 2020: Upply @MongoDB : Upply : Quand le Machine Learning...
PDF
MongoDB .local Paris 2020: Les bonnes pratiques pour sécuriser MongoDB
PDF
MongoDB .local Paris 2020: Tout savoir sur le moteur de recherche Full Text S...
MongoDB SoCal 2020: Migrate Anything* to MongoDB Atlas
MongoDB SoCal 2020: Go on a Data Safari with MongoDB Charts!
MongoDB SoCal 2020: Using MongoDB Services in Kubernetes: Any Platform, Devel...
MongoDB SoCal 2020: A Complete Methodology of Data Modeling for MongoDB
MongoDB SoCal 2020: From Pharmacist to Analyst: Leveraging MongoDB for Real-T...
MongoDB SoCal 2020: MongoDB Atlas Jump Start
MongoDB .local San Francisco 2020: Powering the new age data demands [Infosys]
MongoDB .local San Francisco 2020: Using Client Side Encryption in MongoDB 4.2
MongoDB .local San Francisco 2020: Using MongoDB Services in Kubernetes: any ...
MongoDB .local San Francisco 2020: Go on a Data Safari with MongoDB Charts!
MongoDB .local San Francisco 2020: MongoDB Atlas Jumpstart
MongoDB .local San Francisco 2020: Tips and Tricks++ for Querying and Indexin...
MongoDB .local San Francisco 2020: Aggregation Pipeline Power++
MongoDB .local San Francisco 2020: A Complete Methodology of Data Modeling fo...
MongoDB .local San Francisco 2020: MongoDB Atlas Data Lake Technical Deep Dive
MongoDB .local San Francisco 2020: Developing Alexa Skills with MongoDB & Golang
MongoDB .local Paris 2020: Realm : l'ingrédient secret pour de meilleures app...
MongoDB .local Paris 2020: Upply @MongoDB : Upply : Quand le Machine Learning...
MongoDB .local Paris 2020: Les bonnes pratiques pour sécuriser MongoDB
MongoDB .local Paris 2020: Tout savoir sur le moteur de recherche Full Text S...

Recently uploaded (20)

PDF
The Rise and Fall of 3GPP – Time for a Sabbatical?
PPTX
A Presentation on Artificial Intelligence
PDF
Shreyas Phanse Resume: Experienced Backend Engineer | Java • Spring Boot • Ka...
PDF
Building Integrated photovoltaic BIPV_UPV.pdf
PPTX
KOM of Painting work and Equipment Insulation REV00 update 25-dec.pptx
PDF
Machine learning based COVID-19 study performance prediction
PPT
“AI and Expert System Decision Support & Business Intelligence Systems”
PPTX
MYSQL Presentation for SQL database connectivity
PDF
Diabetes mellitus diagnosis method based random forest with bat algorithm
PDF
Spectral efficient network and resource selection model in 5G networks
PDF
Reach Out and Touch Someone: Haptics and Empathic Computing
PDF
NewMind AI Weekly Chronicles - August'25 Week I
PPTX
Effective Security Operations Center (SOC) A Modern, Strategic, and Threat-In...
PDF
Architecting across the Boundaries of two Complex Domains - Healthcare & Tech...
PDF
Per capita expenditure prediction using model stacking based on satellite ima...
PDF
cuic standard and advanced reporting.pdf
PDF
Unlocking AI with Model Context Protocol (MCP)
PDF
NewMind AI Monthly Chronicles - July 2025
PDF
How UI/UX Design Impacts User Retention in Mobile Apps.pdf
PPTX
Digital-Transformation-Roadmap-for-Companies.pptx
The Rise and Fall of 3GPP – Time for a Sabbatical?
A Presentation on Artificial Intelligence
Shreyas Phanse Resume: Experienced Backend Engineer | Java • Spring Boot • Ka...
Building Integrated photovoltaic BIPV_UPV.pdf
KOM of Painting work and Equipment Insulation REV00 update 25-dec.pptx
Machine learning based COVID-19 study performance prediction
“AI and Expert System Decision Support & Business Intelligence Systems”
MYSQL Presentation for SQL database connectivity
Diabetes mellitus diagnosis method based random forest with bat algorithm
Spectral efficient network and resource selection model in 5G networks
Reach Out and Touch Someone: Haptics and Empathic Computing
NewMind AI Weekly Chronicles - August'25 Week I
Effective Security Operations Center (SOC) A Modern, Strategic, and Threat-In...
Architecting across the Boundaries of two Complex Domains - Healthcare & Tech...
Per capita expenditure prediction using model stacking based on satellite ima...
cuic standard and advanced reporting.pdf
Unlocking AI with Model Context Protocol (MCP)
NewMind AI Monthly Chronicles - July 2025
How UI/UX Design Impacts User Retention in Mobile Apps.pdf
Digital-Transformation-Roadmap-for-Companies.pptx

MongoDB World 2016: The Best IoT Analytics with MongoDB

  • 1. The Best IoT Analytics with MongoDB Jake Angerman Sr. Solutions Architect MongoDB
  • 2. Sessions: 1. Building an IoT Application that Will Work Next Year 2. Building IoT Applications the Right Way 3. The Best IoT Analytics with MongoDB Track Overview ✔ ✔
  • 4. #MDBW16 Morpheus: time series data is everywhere Morpheus picture
  • 5. #MDBW16 Automatic Dependent Surveillance Broadcast (ADS-B) Primary radar Secondary Surveillance Radar Software defined radio 1090 MHz 1030 MHz 1090 MHz
  • 6. #MDBW16 Tin Can Reveal homemade antenna (6.9cm quarter-wave whip) NooElec NESDR Mini 2 SDR $23.00 USB extension cable $10.00 RF cable RG316 female to MCX male $5.50 tin can $2.87 Total: $41.37 6.9cm antenna USB SDR dump1090
  • 8. #MDBW16 Antenna Range approximately 250 miles (400km) > db.tincan.aggregate( [{ $geoNear: { near: { type: "Point", coordinates: [ center_y, center_x ] }, distanceField: "meters", minDistance: 394289, limit: 100, spherical: true }}, {$sort: { "meters": -1}}, {$limit: 1} ])
  • 9. #MDBW16 Antenna Range approximately 250 miles (400km) > db.tincan.aggregate( [{ $geoNear: { near: { type: "Point", coordinates: [ center_y, center_x ] }, distanceField: "meters", minDistance: 394289, limit: 100, spherical: true }}, {$sort: { "meters": -1}}, {$limit: 1} ])
  • 10. #MDBW16 ADS-B BaseStation data format MSG,7,111,11111,A3DC34,111111,2016/03/28,21:42:25.875,2016/03/28,21:42:25.865,,36975,,,,,,,,,,0 MSG,7,111,11111,A3DC34,111111,2016/03/28,21:42:25.884,2016/03/28,21:42:25.865,,36975,,,,,,,,,,0 MSG,8,111,11111,A33AA7,111111,2016/03/28,21:42:25.898,2016/03/28,21:42:25.865,,,,,,,,,,,,0 MSG,5,111,11111,A33AA7,111111,2016/03/28,21:42:25.961,2016/03/28,21:42:25.931,,28225,,,,,,,0,,0,0 MSG,3,111,11111,A678EF,111111,2016/03/28,21:42:26.013,2016/03/28,21:42:25.996,,34000,,,30.58369,-98.75438,,,,,,0 MSG,4,111,11111,A678EF,111111,2016/03/28,21:42:26.013,2016/03/28,21:42:25.996,,,417,283,,,0,,,,,0 MSG,3,111,11111,0D081C,111111,2016/03/28,21:42:26.280,2016/03/28,21:42:26.258,,35975,,,29.86456,-98.24018,,,,,,0 MSG,4,111,11111,0D081C,111111,2016/03/28,21:42:26.280,2016/03/28,21:42:26.258,,,429,206,,,0,,,,,0 MSG,8,111,11111,0D0648,111111,2016/03/28,21:42:26.358,2016/03/28,21:42:26.324,,,,,,,,,,,,0 MSG,3,111,11111,A678EF,111111,2016/03/28,21:42:26.454,2016/03/28,21:42:26.390,,34000,,,30.58389,-98.75544,,,,,,0 MSG,8,111,11111,A33AA7,111111,2016/03/28,21:42:26.478,2016/03/28,21:42:26.455,,,,,,,,,,,,0 MSG,7,111,11111,A678EF,111111,2016/03/28,21:42:26.679,2016/03/28,21:42:26.651,,34000,,,,,,,,,,0 MSG,7,111,11111,0D081C,111111,2016/03/28,21:42:26.759,2016/03/28,21:42:26.717,,35975,,,,,,,,,,0 altitudeICAO hex lat/long date & time stamp message type speed
  • 11. #MDBW16 ADS-B in JSON { "timestamp" : ISODate("2016-01-31T20:54:35.000+0000"), "icao" : "AC4144", "callsign" : "N889WM", "altitude" : 9350, "bearing" : 150, "position" : [-98.62762, 30.03657], "ground_speed" : 152, "vertical_rate" : 192 }
  • 12. #MDBW16 dump1090 dump1090 data flow Linked List in RAM HTTP :8080 BaseStation TCP :30003 [{"hex":"ac741c", "squawk":"6234", "flight":"AAL2417 ", "lat": 30.619176, "lon":-97.755963, "validposition":1, "altitude":35975, "vert_rate":0,"track":202, "validtrack":1, "speed":438, "messages":557, "seen":0}] AJAX JSON
  • 13. #MDBW16 dump1090 dump1090 data flow Linked List in RAM HTTP :8080 BaseStation TCP :30003 [{"hex":"ac741c", "squawk":"6234", "flight":"AAL2417 ", "lat": 30.619176, "lon":-97.755963, "validposition":1, "altitude":35975, "vert_rate":0,"track":202, "validtrack":1, "speed":438, "messages":557, "seen":0}] AJAX JSON ingest.py MSG,7,111,11111,A3DC34,111111,2016/03/28, 21:42:25.875,2016/03/28,21:42:25.865,,36975 MongoDB TCP
  • 14. #MDBW16 What Types of Analytics Can We Do? •  Real-time dashboards (<1 second latency) = Aggregation framework •  Ad-hoc queries = Aggregation framework •  Historical Reports = Aggregation framework or BI Connector •  Batch processing = Hadoop •  Machine Learning = Spark
  • 15. #MDBW16 Analytics without Data Migration Database Historical Analysis Devices Dashboards DB DB ETL ETL
  • 16. #MDBW16 Analytics without Data Migration Database Historical Analysis Devices DB DB ETL ETL Dashboards
  • 17. #MDBW16 Analytics without Data Migration Database Historical Analysis Devices Dashboards •  No bulk or incremental ETL required •  One language for both real-time and ad-hoc queries
  • 21. #MDBW16 dump1090 dump1090 dashboard Linked List in RAM HTTP :8080 BaseStation TCP :30003 [{"hex":"ac741c", "squawk":"6234", "flight":"AAL2417 ", "lat":30.619176, "lon":-97.755963, "validposition":1, "altitude":35975, "vert_rate":0,"track":202, "validtrack":1, "speed":438, "messages":557, "seen":0}] AJAX JSON ingest.py MSG,7,111,11111,A3DC34,111111,2016/03/28, 21:42:25.875,2016/03/28,21:42:25.865,,36975 MongoDB TCP WT cache
  • 22. #MDBW16 Real-time Dashboards •  Current Radar, last 5 minutes' worth of aircraft data •  pipeline = [ {"$match": {"t": {"$gte": datetime.datetime.utcnow() - datetime.timedelta(minutes=5) }}}, {"$sort": { "icao":1, "t":1 }}, {"$group": {"_id" : {"icao": "$icao"}, "events": {"$push": {"flight":"$callsign", "altitude":"$a", "track":"$b", "speed":"$s", "lon": { "$arrayElemAt":["$p", 0] }, "lat": { "$arrayElemAt":["$p", 1] }, "vert_rate":"$v" }}, "sum": {"$sum":1}}}, {"$project" :{ "_id":0, "icao":"$_id.icao", "events":"$events", "sum":"$sum"  }}  ] $match first uses index pre-built array avoids clumsy looping in application
  • 23. #MDBW16 Ad hoc aggregations Which aircraft has the most observations? > db.tincan.aggregate([ { $group: { _id: "$icao", "sum": {$sum: 1}, "callsigns": {"$addToSet": "$callsign"} }}, { $sort: { "sum": -1 }}, {$limit: 1} ]) { "_id": ObjectId("5755..."), "icao": "ADE201", "callsign": "N994FE", "a": 8600, "b": 104, "p": [-98.99888, 30.93031], "s": 164, "t": ISODate("2016-02-09T02:33:01Z"), }
  • 24. #MDBW16 Which aircraft has the most observations? "result": [ { "_id": "ADE201", "sum": 14373, "callsigns": [ "N994FE" ] } { "_id": ObjectId("5755..."), "icao": "ADE201", "callsign": "N994FE", "a": 8600, "b": 104, "p": [-98.99888, 30.93031], "s": 164, "t": ISODate("2016-02-09T02:33:01Z"), }
  • 25. #MDBW16 ICAO aircraft collection $ mongoimport -d adsb -c aircraft --type csv --headerline aircraft_db.csv icao,regid,mdl,type,operator 000334,PU-PLS,ULAC,EDRA SUPER PETREL LS,PRIVATE OWNER 000D77,PU-VGA,WT9,WT-9 DYNAMIC,PRIVATE OWNER 000D82,PU-DCT,WT9,AEROSPOOL WT9 DYNAMIC,PRIVATE OWNER 001100,-,320,UNKNOWN / VARIOUS,CODE USED BY SEVERAL AIRCRAFT 001108,EJC-1108,AC90,GULFSTREAM 690D,EJERCITO DE COLOMBIA 001411,PU-BGC,RV9,AMATEUR VANS RV-9A,PRIVATE OWNER 002008,LV-S004,P208,TECNAM P-2008,PRIVATE OWNER 003106,PU-FUA,ULAC,AMATEUR GFLY,PRIVATE OWNER 004003,Z-WPB,B732,BOEING 737-2N0,AIR ZIMBABWE ...
  • 26. #MDBW16 $lookup to find aircraft model > db.tincan.aggregate([ { $group: { _id: "$icao", "sum": {$sum: 1}, "callsigns": {"$addToSet": "$callsign"} }}, { $sort: { "sum": -1 }}, { $limit: 1 }, { $lookup: { from:"aircraft", localField:"_id", foreignField:"icao", as:"description" }} ])
  • 27. #MDBW16 $lookup to find aircraft model "result": [ { "_id": "ADE201", "sum": 14373, "callsigns": [ "N994FE" ], "description": [ { "_id": ObjectId("575074300cf625050f2e730e"), "icao": "ADE201", "regid": "N994FE", "mdl": "C208", "type": "CESSNA 208B GRAND CARAVAN" } ]
  • 29. #MDBW16 Which aircraft is seen the most number of days? > db.tincan.aggregate([ { $group: { _id: {icao: "$icao", dayOfYear: {$dateToString: { format: "%Y%m%d", date: "$t"}}}}}, {$group:{ _id: "$_id.icao", sum: { $sum: 1 }}}, { $sort:{ "sum": -1 }}, { $limit: 1 }, { $lookup: { from:"aircraft", localField:"_id", foreignField:"icao", as:"description" }} ])
  • 30. #MDBW16 Which aircraft is seen the most number of days?   "result": [     {       "_id": "A35969",       "sum": 63,       "description": [         {           "_id": ObjectId("5762e9cf6ecfc147a0503894"),           "icao": "A35969",           "regid": "N315AE",           "mdl": "B06",           "type": "BELL 206L-1 LONGRANGER II",           "operator": "AIR EVAC EMS"         }       ]
  • 33. #MDBW16 BI Connector •  New in MongoDB 3.2 Enterprise Advanced •  Mapping and transformation layer •  Projects smaller parts of large data sets for reporting
  • 34. #MDBW16 MongoDB Query Language SQL BI Connector Data flow MongoDB BI Connector Mapping metadata Application data {name:
 “Andrew”, address: {street: …}} Document Table Analytics & visualization
  • 37. #MDBW16 Altitude vs Speed •  Two predictable clusters: •  turbine aircraft at cruising altitude •  piston aircraft at lower altitude
  • 38. #MDBW16 Altitude vs Speed •  Two predictable clusters: •  turbine aircraft at cruising altitude •  piston aircraft at lower altitude
  • 39. #MDBW16 Altitude vs Speed •  Two predictable clusters: •  turbine aircraft at cruising altitude •  piston aircraft at lower altitude •  Outliers are Cessnas reporting 51,000+ ft
  • 40. Spark
  • 41. #MDBW16 Spark Overview •  fast, general data processing engine •  interactive shell •  Scala, Java, Python •  machine learning libraries (mllib) •  supports streaming •  HDFS not required
  • 43. #MDBW16 Spark Connector Diagram •  diagram MongoDB Connector for Hadoop (with Spark Plug-in) https://github.com/mongodb/mongo-hadoop MongoDB Connector for Spark https://github.com/mongodb/mongo-spark
  • 44. #MDBW16 Supervised Unsupervised Classification •  Naive Bayes •  Support Vector Machines •  Random Decision Forests Clustering •  K-means Regression •  Linear •  Logistic Dimensionality Reduction •  Principal Component Analysis •  Singular Value Decomposition Spark Machine Learning
  • 45. #MDBW16 K-Means Clustering The K-Means algorithm aims to minimize the sum of squares of the distance between the points and the centroid of each cluster. source: Lovro Iliassich, toptal.com
  • 46. #MDBW16 K-Means Clustering >>> mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/adsb.tincan') OR specify a filter: >>> input_conf = {"mongo.job.input.format": "com.mongodb.hadoop.MongoInputFormat", "mongo.input.uri": "mongodb:// localhost:27017/adsb.tincan", "mongo.input.query": '{"t":{"$lte":{"$date": 1455494400000}}}' } >>> mongo_rdd = sc.newAPIHadoopRDD(inputFormatClassName, keyClassName, valueClassName, None, None, input_conf)
  • 47. #MDBW16 K-Means Clustering >>> mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/adsb.tincan') >>> mongo_rdd.first() {u'icao': u'A06690', u'a': 11975, u'b': 150, u'_id': ObjectId('5755bb862355da56d87895cf'), u't': datetime.datetime(2016, 2, 8, 5, 25, 4), u'p': [-98.41437, 30.29066], u's': 285, u'v': -1152}
  • 48. #MDBW16 K-Means Clustering >>> mongo_rdd = sc.mongoRDD('mongodb://localhost:27017/adsb.tincan') >>> mongo_rdd.first() {u'icao': u'A06690', u'a': 11975, u'b': 150, u'_id': ObjectId('5755bb862355da56d87895cf'), u't': datetime.datetime(2016, 2, 8, 5, 25, 4), u'p': [-98.41437, 30.29066], u's': 285, u'v': -1152} >>> parsed_rdd = mongo_rdd.map(parseData) >>> parsed_rdd.first() [5, 25, 4, 1, 11975, 150, 285, -1152, -98.14857, 30.92651]
  • 49. #MDBW16 Choosing K $J = \sum_{i=1}^{k} \sum_{x \in S_i} \lVert x - \mu_i \rVert^2$ 0 2,000,000 4,000,000 6,000,000 8,000,000 10,000,000 12,000,000 14,000,000 0 20 40 60 80 100 120 140 160 180 200 k Within Set Sum of Squared Error WSSSE
  • 50. #MDBW16 Standard Scaling $z = \frac{x - \mu}{\sigma}$ >>> parsed_rdd.first() [5, 25, 4, 1, 11975, 150, 285, -1152, -98.14857, 30.92651] >>> scaled_features.first() [-1.036, -1.1089, -0.2617, 0.6821, -0.8202, 0.4057, 0.8537, -1.6502, -0.6559, 0.6876]
  • 51. #MDBW16 K-Means Clustering >>> k = 10 >>> clusters = KMeans.train(parsed_rdd, k, maxIterations=10, runs=1, initializationMode="random") >>> cluster_sizes = parsed_rdd.map(lambda e: clusters.predict(e)).countByValue() >>> cluster_sizes defaultdict(<type 'int'>, {0: 70122, 1: 350890, 2: 118596, 3: 104609, 4: 254759, 5: 175840, 6: 166789, 7: 68309, 8: 147826, 9: 495102})
  • 52. #MDBW16 Save Results Back to MongoDB def labelData(array):      result = {}      result['cluster'] = clusters.predict(array)      result['daystamp'] = str(array[0])      result['dayofweek'] = array[1]      result['hour'] = array[2]      result['minute'] = array[3]      result['second'] = array[4]      result['a'] = array[5]      result['b'] = array[6]      result['s'] = array[7]      result['v'] = array[8]      result['p'] = [ array[9], array[10] ]      return result >>> labeled_rdd = parsed_rdd.map(labelData) >>> labeled_rdd.saveToMongoDB('mongodb:// localhost:27017/adsb.labeled')
  • 53. #MDBW16 K-Means Clustering >>> cluster_sizes defaultdict(<type 'int'>, {0: 70122, 1: 350890, 2: 118596, 3: 104609, 4: 254759, 5: 175840, 6: 166789, 7: 68309, 8: 147826, 9: 495102}) Hypothesis: largest cluster #9 is cruising altitude
  • 54. #MDBW16 Hypothesis: largest cluster #9 is cruising altitude adsb> db.labeled.aggregate([ {$match: {cluster:9}}, {$group: {_id: "summary", "avg_alt": {$avg:"$a"}, "min_alt": {$min:"$a"}, "max_alt": {$max:"$a"} }}])
  • 55. #MDBW16 Hypothesis: largest cluster #9 is cruising altitude   "result": [     {       "_id": "summary",       "avg_alt": 33630,       "min_alt": 30675,       "max_alt": 35825     }
  • 57. #MDBW16 Anomaly! •  Plane appears at 12,000 ft out of nowhere
  • 59. #MDBW16 Don't Worry, He's OK •  4 days later…
  • 62. #MDBW16 Market Size $36 Billion Partners 1,000+ International Offices 15 Global Employees 575+ Downloads Worldwide 15,000,000+ Make a GIANT Impact www.mongodb.com/careers