SlideShare a Scribd company logo
Better	Visualization	of		Trips	Through					
Agglomerative	Clustering	
Anbarasan S
February 2, 2016
1.	Problem	Statement	:
Visualization of flow/mobility data on a map always gives a cluttered view, even for small
dataset .Hence it is difficult , to derive any insights or make decisions out of it.
2.	Solution:
Devise a methodology to group or aggregate similar flows
3.	Methodology	:
Step 1: K-Nearest Neighbours
1.a. Find the k-nearest neighbors for the Origin location of particular Trip/Flow.
1.b.Find The k-nearest Neighbour For the destination location of particular
Trip/Flow.
Step 2: Contiguous Flows
Two flows/Trips ,are said to be contiguous if and only if, it satisfies both the
following conditions
1. K-NN of Origin1(Trip1) overlaps with k-NN of Origin2(Trip2).
2. K-NN of Destination1(Trip1) overlaps with k-NN of Destination2(Trip2).
Step 3: Agglomerative Clustering
Two flows are clustered in a agglomerative fashion, based on a distance measure
defined by the number of nearest neighbours shared
Dist(Trip1,Trip2) = 1- [KNN(O1) η KNN(O2)/k * KNN(D1) η KNN(D2)/k]
O1,O2- Origins Of Trip1 and Trip2 respectively
D1,D2 destination of Trip1 and Trip2 respectively
Very low value of dist , suggests that the flows are very nearer, and larger value >=1
suggests that flows cannot be clustered together.
Step 4: Visualization
Agglomerative Clusters , when projected on to map, gives meaningful insights
4. Implementation:
Dataset: Taxi Service Trajectory - Prediction Challenge, ECML PKDD 2015 Data Set
https://archive.ics.uci.edu/ml/datasets/Taxi+Service+Trajectory+-
+Prediction+Challenge,+ECML+PKDD+2015
tt_trajectory <- read.csv("G:/TaxiTrajectory/train.csv",
nrows = 500, stringsAsFactors = FALSE)
head(tt_trajectory,1)
## TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP
## 1 1.372637e+18 C NA NA 20000589 137263 6858
## DAY_TYPE MISSING_DATA
## 1 A False
##
POLYLINE
## 1 [[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[-
8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[-
8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[-
8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[-
8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[-
8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[-
8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[-
8.630829,41.154498],[-8.630838,41.154489]]
4.1.Get	the	Origin	And	Destination	location	for	each	trip
∑ The Polyline field contains the Longitude Latitude location details of a trip taken every
15 secs.
∑ The polyline field is parsed using JSON library , to get the Origin and destination
locations of each trip
library(RJSONIO)
## Warning: package 'RJSONIO' was built under R version 3.2.3
# Function to get Origin and Destination Location of each trip
# using json library
positions_1 <- function(row){
trajectory_list <- fromJSON(row$POLYLINE)
no_of_points <- length(trajectory_list)
if(no_of_points>0){
lan_lat_pairs<-data.frame(rbind(trajectory_list[[1]],
trajectory_list[[no_of_points]]))
return(lan_lat_pairs)
}}
coordinates <- data.frame(TripId=c(), Ordinal=c(),Booking_time= c(), Lat=c(),
Lon=c(), Status=c())
for (i in 1:nrow(tt_trajectory)) {
lat_lon_df <- positions_1(tt_trajectory[i,])
status <- c("Origin","Destination")
if(!is.null(lat_lon_df)){
coordinates <- rbind(coordinates,
data.frame(TripId = tt_trajectory$TRIP_ID[i],
Booking_time = tt_trajectory$TIMESTAMP[i],
Lat = lat_lon_df$X2,
Lon = lat_lon_df$X1,
Status = status ))
}}
coordinates$Status <- factor(coordinates$Status, levels <-
c("Origin","Destination"))
head(coordinates)
## TripId Booking_time Lat Lon Status
## 1 1.372637e+18 1372636858 41.14141 -8.618643 Origin
## 2 1.372637e+18 1372636858 41.15449 -8.630838 Destination
## 3 1.372637e+18 1372637303 41.15983 -8.639847 Origin
## 4 1.372637e+18 1372637303 41.17067 -8.665740 Destination
## 5 1.372637e+18 1372636951 41.14036 -8.612964 Origin
## 6 1.372637e+18 1372636951 41.14053 -8.615970 Destination
4.2	Visualize	the	Trips
Visualize	the	trips	on	to	a	map
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
lat.centre = median(coordinates$Lat)
lon.centre = median(coordinates$Lon)
map.portugal <-ggmap(get_map(location = c(lon.centre,lat.centre),
maptype = "roadmap",
source="google",
zoom = 12))
## Map from URL :
http://maps.googleapis.com/maps/api/staticmap?center=41.157607, -
8.615227&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en -
EN&sensor=false
map.portugal+
geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)
## Warning: Removed 5 rows containing missing values (geom_point).
# Connect SOurce and Destination for Each trip
map.portugal+
geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)+
geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color =
"blue")
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missin g values (geom_path).
map.portugal+
geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ),
color = "blue")
## Warning: Removed 5 rows containing missing values (geom_path).
Projection of 1/100 of the original dataset, gives a cluttered view on the map.
4.3.Find	K-NN	for	origin	points	and	destination	points	
separately
4.3.1. Knn for Origin
# Find distance using Haversiandistance
library(dplyr)
library(geosphere)
library(dbscan)
# Function to Convert Ids to TripIds
Ids_to_tripIds<- function(row) lat_lon_origin[row,"TripId"]
# Origin of all the trips
lat_lon_origin <- filter(coordinates,Status == "Origin") %>%
select(TripId,Lat,Lon) %>%
as.matrix()
# Find the actual distance in meters between the origin points
distance_matrix_origin <- distm(lat_lon_origin[,c(3,2)],
fun=distHaversine)
# Find kNN For origin
# No. of. nearest neighbours = 50
knn_origin <- kNN(distance_matrix_origin,k=50)
Origin_NN_ids <- apply(knn_origin$id,1,Ids_to_tripIds)
# Construct nearest neighbour for every origin point
origin_NN <- data.frame(FlowId = c(),Neighbour_Flows = c())
for (i in 1:nrow(lat_lon_origin)) {
origin_NN <- rbind(origin_NN,
data.frame(FlowId = lat_lon_origin[i,"TripId"],
Neighbour_Flows =
Origin_NN_ids[,i]))
}
head(origin_NN)
## FlowId Neighbour_Flows
## 1 1.372637e+18 1.372657e+18
## 2 1.372637e+18 1.372662e+18
## 3 1.372637e+18 1.372638e+18
## 4 1.372637e+18 1.372663e+18
## 5 1.372637e+18 1.372656e+18
## 6 1.372637e+18 1.372642e+18
4.3.2.Knn for destination
# Destination
lat_lon_dest <- filter(coordinates,Status == "Destination")%>%
select(TripId,Lat,Lon) %>% as.matrix()
distance_matrix_dest <- distm(lat_lon_dest[,c(3,2)], fun=distHaversine)
# Find kNN For destination
knn_destination <- kNN(distance_matrix_dest,k=50)
dest_NN_ids <- apply(knn_destination$id,1,Ids_to_tripIds)
dest_NN <- data.frame(FlowId = c(),Neighbour_Flows = c())
for (i in 1:nrow(lat_lon_dest)) {
dest_NN <- rbind(dest_NN,
data.frame(FlowId = lat_lon_dest[i,"TripId"],
Neighbour_Flows = dest_NN_ids[,i]))
}
4.4	Find	Contiguous	Flow	Pair
flow_pair <- data.frame(flow = c(), Neighbour_flow = c(),
snn_distance = c() )
flow_distance <- function(row,o_NN,d_NN){
onn <-
origin_NN[origin_NN$FlowId==row,]$Neighbour_Flows
dnn <-
dest_NN[dest_NN$FlowId==row,]$Neighbour_Flows
common_o_points <- sum(o_NN %in% onn)/50
common_d_points <- sum(d_NN %in% dnn)/50
snn <- 1-(common_o_points*common_d_points)
return(snn)
}
for(i in tt_trajectory$TRIP_ID){
o_NN <- filter(origin_NN,FlowId==i)%>% select(Neighbour_Flows) #nearest
neighbours of given origin
d_NN <- filter(dest_NN,FlowId == i )%>%select(Neighbour_Flows) #NNs of
given destination
NN_matches <- o_NN[,1] %in% d_NN[,1] #Flows having Common Origin and
Common destination
# Contiguous/Nearest Flows for a given flow
# Two Flows are said to be Contiguous if they are in
# Nearest neighbour of both origin and destination of a given flow
contiguous_flows <-o_NN$Neighbour_Flows[NN_matches]
#dist bw Flows
#Arguments Passed:
##1.List of Flows Found to be Contiguous to a given flow
##2.flow_distance -Function to calculate distance between 2 flows
if(length(contiguous_flows)!=0){
snn_flow_distance <-
sapply(contiguous_flows,flow_distance,o_NN[,1],d_NN[,1])
flow_pair <- rbind(flow_pair,
data.frame(flow = i,
Neighbour_flow = contiguous_flows,
snn_distance = snn_flow_distance ))
}
# print(length(contiguous_flows))
# print(length(snn_flow_distance))
}
flow_pair <- flow_pair[flow_pair$snn_distance!=1,]
flow_pair <-
flow_pair[order(flow_pair$flow,flow_pair$snn_distance),]
4.5	Agglomerative	Clustering
## Initialize Clusters
for(i in 1:nrow(tt_trajectory)){
tt_trajectory$Cluster[i] = paste("c",i,sep = "")
## Group Clusters
for(i in 1:nrow(flow_pair)){
flow_1 <- flow_pair$flow[i]
flow_2 <- flow_pair$Neighbour_flow[i]
#Determine the Cluster,the Flows associated with
cluster_label_flow_1 <- tt_trajectory[tt_trajectory$TRIP_ID
==flow_1,]$Cluster
cluster_label_flow_2 <- tt_trajectory[tt_trajectory$TRIP_ID
==flow_2,]$Cluster
#Find Distance Between Clusters
#Step 1:Get All points in a single cluster
flows_with_cluster_1 <- tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_1,]$TRIP_ID
flows_with_cluster_2 <- tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_2,]$TRIP_ID
#Step 2:Compute Distance Matrix
distance_bw_clusters = data.frame(distances = c())
for(j in flows_with_cluster_1){
for(k in flows_with_cluster_2 ){
rec_in_flowpair <- subset(flow_pair,flow == j &
Neighbour_flow == k)
if(nrow(rec_in_flowpair)!=0){
distance_bw_clusters = rbind(distance_bw_clusters,
data.frame(distances =
rec_in_flowpair$snn_distance))
}
}
}
#Step 3:Condition for grouping Clusters Together -Linkage
Measures
avg_dist_bw_clusters <- mean(distance_bw_clusters$distances)
if(!is.na(avg_dist_bw_clusters) & avg_dist_bw_clusters < 1){
tt_trajectory[tt_trajectory$Cluster ==
cluster_label_flow_2,]$Cluster <-
cluster_label_flow_1
}
}
4.6	Visuaize	the	clusters
cluster_table <- group_by(tt_trajectory,Cluster) %>%
count(Cluster) %>%
filter(n>1)%>% select(Cluster,n)
Cluster_centroids <- data.frame(ClusterId = c(),Lat = c(),Lon =
c(),no_of_flows = c())
for(i in cluster_table$Cluster){
tripids_of_cluster <-
tt_trajectory[tt_trajectory$Cluster==i,]$TRIP_ID
lat_lon_o_df <- data.frame(lat_lon_origin)
cluster_o_positions <- filter(lat_lon_o_df,
lat_lon_o_df$TripId %in%
tripids_of_cluster)%>%select(Lat,Lon)
Cluster_centroids <- rbind(Cluster_centroids,
data.frame(ClusterId=i,
Lat =
apply(cluster_o_positions,2,median)[1],
Lon =
apply(cluster_o_positions,2,median)[2],
no_of_flows =
cluster_table[cluster_table$Cluster==i,]$n))
lat_lon_d_df <- data.frame(lat_lon_dest)
cluster_d_positions <- filter(lat_lon_d_df,
lat_lon_d_df$TripId %in%
tripids_of_cluster) %>% select(Lat,Lon)
Cluster_centroids <- rbind(Cluster_centroids,
data.frame(ClusterId = i,
Lat =
apply(cluster_d_positions,2,median)[1],
Lon =
apply(cluster_d_positions,2,median)[2],
no_of_flows =
cluster_table[cluster_table$Cluster==i,]$n))
}
map.portugal+
geom_line(data = Cluster_centroids,
aes(x=Lon,y=Lat,group =ClusterId, size = no_of_flows,
alpha = 1.5),color = "red")
CONCLUSION
Flow	Clustering	of	trips	,	thus	provides	a	meaningful	insights	about	the	flow	
,the	more	broader	the	clustered	flow,	it	has	more	number	of	actual	flows.On	
increasing	the	number	of	nearest	neighbours	,	the	flow	clusters	gives	a	clear	
picture	of	the	flows.	Due	to	extensive	computation	required	for	such	analysis,	
I	restricted	to	reduced	number	of	trips(500)	and	reduced	number	of	nearest	
neighbours(50)

More Related Content

PDF
ClusterAnalysis
PPTX
Beginning direct3d gameprogramming10_shaderdetail_20160506_jintaeks
PDF
P5 - Routing Protocols
PDF
Drobics, m. 2001: datamining using synergiesbetween self-organising maps and...
PDF
Geolocation on Rails
PDF
Understanding GPS & NMEA Messages and Algo to extract Information from NMEA.
ODP
Nmea Introduction
KEY
R meets Hadoop
ClusterAnalysis
Beginning direct3d gameprogramming10_shaderdetail_20160506_jintaeks
P5 - Routing Protocols
Drobics, m. 2001: datamining using synergiesbetween self-organising maps and...
Geolocation on Rails
Understanding GPS & NMEA Messages and Algo to extract Information from NMEA.
Nmea Introduction
R meets Hadoop

What's hot (19)

PDF
Math426_Project3-1
KEY
RHadoop の紹介
PDF
Geo Spatial Plot using R
ODP
Geospatial Data in R
PDF
Maximum Likelihood Calibration of the Hercules Data Set
PDF
imageCorrectionLinearDiffusion
PDF
Data visualization using the grammar of graphics
PDF
Dplyr and Plyr
PDF
Data visualization with multiple groups using ggplot2
DOCX
CLUSTERGRAM
PPTX
Group01_Project3
PDF
Ggplot2 ch2
PDF
Introduction to spatial data analysis in r
PDF
Geolocation Databases in Ruby on Rails
PDF
Quiz 2
PDF
Move your data (Hans Rosling style) with googleVis + 1 line of R code
PPTX
Optimisation random graph presentation
PDF
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
Math426_Project3-1
RHadoop の紹介
Geo Spatial Plot using R
Geospatial Data in R
Maximum Likelihood Calibration of the Hercules Data Set
imageCorrectionLinearDiffusion
Data visualization using the grammar of graphics
Dplyr and Plyr
Data visualization with multiple groups using ggplot2
CLUSTERGRAM
Group01_Project3
Ggplot2 ch2
Introduction to spatial data analysis in r
Geolocation Databases in Ruby on Rails
Quiz 2
Move your data (Hans Rosling style) with googleVis + 1 line of R code
Optimisation random graph presentation
STATE SPACE GENERATION FRAMEWORK BASED ON BINARY DECISION DIAGRAM FOR DISTRIB...
Ad

Viewers also liked (6)

PPT
Decomposing Object oriented class
PPT
Data miningpresentation
PPTX
Hierarchical clustering
PPT
Correspondence analysis(step by step)
PDF
Hierarchical Clustering
PDF
The Top Skills That Can Get You Hired in 2017
Decomposing Object oriented class
Data miningpresentation
Hierarchical clustering
Correspondence analysis(step by step)
Hierarchical Clustering
The Top Skills That Can Get You Hired in 2017
Ad

Similar to Better Visualization of Trips through Agglomerative Clustering (20)

DOC
algorithm Unit 3
PPTX
Cab travel time prediction using ensemble models
PDF
Prob-Dist-Toll-Forecast-Uncertainty
DOC
Unit 3 daa
PPTX
Spark - Citi Bike NYC
PDF
Lecture set 5
PPTX
routing algorithm
PPT
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
PPTX
Where in the world
PDF
Supply chain logistics : vehicle routing and scheduling
PDF
2013.11.14 Big Data Workshop Bruno Voisin
PDF
10. R getting spatial
PPTX
Branch and bounding : Data structures
PPTX
IAP presentation-1.pptx
PPT
Page rank
PPTX
Mrongraphs acm-sig-2 (1)
PPS
GEOMETRIC TAMPERING ESTIMATION BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
PDF
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
PDF
Data manipulation with dplyr
PDF
R getting spatial
 
algorithm Unit 3
Cab travel time prediction using ensemble models
Prob-Dist-Toll-Forecast-Uncertainty
Unit 3 daa
Spark - Citi Bike NYC
Lecture set 5
routing algorithm
ECCV2008: MAP Estimation Algorithms in Computer Vision - Part 2
Where in the world
Supply chain logistics : vehicle routing and scheduling
2013.11.14 Big Data Workshop Bruno Voisin
10. R getting spatial
Branch and bounding : Data structures
IAP presentation-1.pptx
Page rank
Mrongraphs acm-sig-2 (1)
GEOMETRIC TAMPERING ESTIMATION BY MEANS OF A SIFT-BASED FORENSIC ANALYSIS
A study on_contrast_and_comparison_between_bellman-ford_algorithm_and_dijkstr...
Data manipulation with dplyr
R getting spatial
 

Better Visualization of Trips through Agglomerative Clustering

  • 1. Better Visualization of Trips Through Agglomerative Clustering Anbarasan S February 2, 2016 1. Problem Statement : Visualization of flow/mobility data on a map always gives a cluttered view, even for small dataset .Hence it is difficult , to derive any insights or make decisions out of it. 2. Solution: Devise a methodology to group or aggregate similar flows 3. Methodology : Step 1: K-Nearest Neighbours 1.a. Find the k-nearest neighbors for the Origin location of particular Trip/Flow. 1.b.Find The k-nearest Neighbour For the destination location of particular Trip/Flow. Step 2: Contiguous Flows Two flows/Trips ,are said to be contiguous if and only if, it satisfies both the following conditions 1. K-NN of Origin1(Trip1) overlaps with k-NN of Origin2(Trip2). 2. K-NN of Destination1(Trip1) overlaps with k-NN of Destination2(Trip2). Step 3: Agglomerative Clustering Two flows are clustered in a agglomerative fashion, based on a distance measure defined by the number of nearest neighbours shared Dist(Trip1,Trip2) = 1- [KNN(O1) η KNN(O2)/k * KNN(D1) η KNN(D2)/k] O1,O2- Origins Of Trip1 and Trip2 respectively D1,D2 destination of Trip1 and Trip2 respectively Very low value of dist , suggests that the flows are very nearer, and larger value >=1 suggests that flows cannot be clustered together.
  • 2. Step 4: Visualization Agglomerative Clusters , when projected on to map, gives meaningful insights 4. Implementation: Dataset: Taxi Service Trajectory - Prediction Challenge, ECML PKDD 2015 Data Set https://archive.ics.uci.edu/ml/datasets/Taxi+Service+Trajectory+- +Prediction+Challenge,+ECML+PKDD+2015 tt_trajectory <- read.csv("G:/TaxiTrajectory/train.csv", nrows = 500, stringsAsFactors = FALSE) head(tt_trajectory,1) ## TRIP_ID CALL_TYPE ORIGIN_CALL ORIGIN_STAND TAXI_ID TIMESTAMP ## 1 1.372637e+18 C NA NA 20000589 137263 6858 ## DAY_TYPE MISSING_DATA ## 1 A False ## POLYLINE ## 1 [[-8.618643,41.141412],[-8.618499,41.141376],[-8.620326,41.14251],[- 8.622153,41.143815],[-8.623953,41.144373],[-8.62668,41.144778],[- 8.627373,41.144697],[-8.630226,41.14521],[-8.632746,41.14692],[- 8.631738,41.148225],[-8.629938,41.150385],[-8.62911,41.151213],[- 8.629128,41.15124],[-8.628786,41.152203],[-8.628687,41.152374],[- 8.628759,41.152518],[-8.630838,41.15268],[-8.632323,41.153022],[- 8.631144,41.154489],[-8.630829,41.154507],[-8.630829,41.154516],[- 8.630829,41.154498],[-8.630838,41.154489]] 4.1.Get the Origin And Destination location for each trip ∑ The Polyline field contains the Longitude Latitude location details of a trip taken every 15 secs. ∑ The polyline field is parsed using JSON library , to get the Origin and destination locations of each trip
  • 3. library(RJSONIO) ## Warning: package 'RJSONIO' was built under R version 3.2.3 # Function to get Origin and Destination Location of each trip # using json library positions_1 <- function(row){ trajectory_list <- fromJSON(row$POLYLINE) no_of_points <- length(trajectory_list) if(no_of_points>0){ lan_lat_pairs<-data.frame(rbind(trajectory_list[[1]], trajectory_list[[no_of_points]])) return(lan_lat_pairs) }} coordinates <- data.frame(TripId=c(), Ordinal=c(),Booking_time= c(), Lat=c(), Lon=c(), Status=c()) for (i in 1:nrow(tt_trajectory)) { lat_lon_df <- positions_1(tt_trajectory[i,]) status <- c("Origin","Destination") if(!is.null(lat_lon_df)){ coordinates <- rbind(coordinates, data.frame(TripId = tt_trajectory$TRIP_ID[i], Booking_time = tt_trajectory$TIMESTAMP[i], Lat = lat_lon_df$X2, Lon = lat_lon_df$X1, Status = status )) }} coordinates$Status <- factor(coordinates$Status, levels <- c("Origin","Destination")) head(coordinates) ## TripId Booking_time Lat Lon Status ## 1 1.372637e+18 1372636858 41.14141 -8.618643 Origin ## 2 1.372637e+18 1372636858 41.15449 -8.630838 Destination ## 3 1.372637e+18 1372637303 41.15983 -8.639847 Origin ## 4 1.372637e+18 1372637303 41.17067 -8.665740 Destination ## 5 1.372637e+18 1372636951 41.14036 -8.612964 Origin ## 6 1.372637e+18 1372636951 41.14053 -8.615970 Destination
  • 4. 4.2 Visualize the Trips Visualize the trips on to a map library(ggmap) ## Warning: package 'ggmap' was built under R version 3.2.3 ## Loading required package: ggplot2 ## Warning: package 'ggplot2' was built under R version 3.2.3 lat.centre = median(coordinates$Lat) lon.centre = median(coordinates$Lon) map.portugal <-ggmap(get_map(location = c(lon.centre,lat.centre), maptype = "roadmap", source="google", zoom = 12)) ## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=41.157607, - 8.615227&zoom=12&size=640x640&scale=2&maptype=roadmap&language=en - EN&sensor=false map.portugal+ geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1) ## Warning: Removed 5 rows containing missing values (geom_point).
  • 5. # Connect SOurce and Destination for Each trip map.portugal+ geom_point(data = coordinates ,aes(x=Lon,y=Lat,color= Status), size= 1)+ geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color = "blue") ## Warning: Removed 5 rows containing missing values (geom_point). ## Warning: Removed 5 rows containing missin g values (geom_path).
  • 6. map.portugal+ geom_line(data = coordinates,aes(x=Lon,y=Lat, group = TripId ), color = "blue") ## Warning: Removed 5 rows containing missing values (geom_path).
  • 7. Projection of 1/100 of the original dataset, gives a cluttered view on the map. 4.3.Find K-NN for origin points and destination points separately 4.3.1. Knn for Origin # Find distance using Haversiandistance library(dplyr) library(geosphere) library(dbscan) # Function to Convert Ids to TripIds Ids_to_tripIds<- function(row) lat_lon_origin[row,"TripId"] # Origin of all the trips lat_lon_origin <- filter(coordinates,Status == "Origin") %>% select(TripId,Lat,Lon) %>% as.matrix() # Find the actual distance in meters between the origin points distance_matrix_origin <- distm(lat_lon_origin[,c(3,2)], fun=distHaversine)
  • 8. # Find kNN For origin # No. of. nearest neighbours = 50 knn_origin <- kNN(distance_matrix_origin,k=50) Origin_NN_ids <- apply(knn_origin$id,1,Ids_to_tripIds) # Construct nearest neighbour for every origin point origin_NN <- data.frame(FlowId = c(),Neighbour_Flows = c()) for (i in 1:nrow(lat_lon_origin)) { origin_NN <- rbind(origin_NN, data.frame(FlowId = lat_lon_origin[i,"TripId"], Neighbour_Flows = Origin_NN_ids[,i])) } head(origin_NN) ## FlowId Neighbour_Flows ## 1 1.372637e+18 1.372657e+18 ## 2 1.372637e+18 1.372662e+18 ## 3 1.372637e+18 1.372638e+18 ## 4 1.372637e+18 1.372663e+18 ## 5 1.372637e+18 1.372656e+18 ## 6 1.372637e+18 1.372642e+18 4.3.2.Knn for destination # Destination lat_lon_dest <- filter(coordinates,Status == "Destination")%>% select(TripId,Lat,Lon) %>% as.matrix() distance_matrix_dest <- distm(lat_lon_dest[,c(3,2)], fun=distHaversine) # Find kNN For destination knn_destination <- kNN(distance_matrix_dest,k=50) dest_NN_ids <- apply(knn_destination$id,1,Ids_to_tripIds) dest_NN <- data.frame(FlowId = c(),Neighbour_Flows = c()) for (i in 1:nrow(lat_lon_dest)) { dest_NN <- rbind(dest_NN, data.frame(FlowId = lat_lon_dest[i,"TripId"],
  • 9. Neighbour_Flows = dest_NN_ids[,i])) } 4.4 Find Contiguous Flow Pair flow_pair <- data.frame(flow = c(), Neighbour_flow = c(), snn_distance = c() ) flow_distance <- function(row,o_NN,d_NN){ onn <- origin_NN[origin_NN$FlowId==row,]$Neighbour_Flows dnn <- dest_NN[dest_NN$FlowId==row,]$Neighbour_Flows common_o_points <- sum(o_NN %in% onn)/50 common_d_points <- sum(d_NN %in% dnn)/50 snn <- 1-(common_o_points*common_d_points) return(snn) } for(i in tt_trajectory$TRIP_ID){ o_NN <- filter(origin_NN,FlowId==i)%>% select(Neighbour_Flows) #nearest neighbours of given origin d_NN <- filter(dest_NN,FlowId == i )%>%select(Neighbour_Flows) #NNs of given destination NN_matches <- o_NN[,1] %in% d_NN[,1] #Flows having Common Origin and Common destination # Contiguous/Nearest Flows for a given flow # Two Flows are said to be Contiguous if they are in # Nearest neighbour of both origin and destination of a given flow contiguous_flows <-o_NN$Neighbour_Flows[NN_matches] #dist bw Flows #Arguments Passed: ##1.List of Flows Found to be Contiguous to a given flow ##2.flow_distance -Function to calculate distance between 2 flows if(length(contiguous_flows)!=0){ snn_flow_distance <- sapply(contiguous_flows,flow_distance,o_NN[,1],d_NN[,1]) flow_pair <- rbind(flow_pair, data.frame(flow = i, Neighbour_flow = contiguous_flows, snn_distance = snn_flow_distance )) } # print(length(contiguous_flows)) # print(length(snn_flow_distance))
  • 10. } flow_pair <- flow_pair[flow_pair$snn_distance!=1,] flow_pair <- flow_pair[order(flow_pair$flow,flow_pair$snn_distance),] 4.5 Agglomerative Clustering ## Initialize Clusters for(i in 1:nrow(tt_trajectory)){ tt_trajectory$Cluster[i] = paste("c",i,sep = "") ## Group Clusters for(i in 1:nrow(flow_pair)){ flow_1 <- flow_pair$flow[i] flow_2 <- flow_pair$Neighbour_flow[i] #Determine the Cluster,the Flows associated with cluster_label_flow_1 <- tt_trajectory[tt_trajectory$TRIP_ID ==flow_1,]$Cluster cluster_label_flow_2 <- tt_trajectory[tt_trajectory$TRIP_ID ==flow_2,]$Cluster #Find Distance Between Clusters #Step 1:Get All points in a single cluster flows_with_cluster_1 <- tt_trajectory[tt_trajectory$Cluster == cluster_label_flow_1,]$TRIP_ID flows_with_cluster_2 <- tt_trajectory[tt_trajectory$Cluster == cluster_label_flow_2,]$TRIP_ID
  • 11. #Step 2:Compute Distance Matrix distance_bw_clusters = data.frame(distances = c()) for(j in flows_with_cluster_1){ for(k in flows_with_cluster_2 ){ rec_in_flowpair <- subset(flow_pair,flow == j & Neighbour_flow == k) if(nrow(rec_in_flowpair)!=0){ distance_bw_clusters = rbind(distance_bw_clusters, data.frame(distances = rec_in_flowpair$snn_distance)) } } } #Step 3:Condition for grouping Clusters Together -Linkage Measures avg_dist_bw_clusters <- mean(distance_bw_clusters$distances) if(!is.na(avg_dist_bw_clusters) & avg_dist_bw_clusters < 1){ tt_trajectory[tt_trajectory$Cluster == cluster_label_flow_2,]$Cluster <- cluster_label_flow_1 } }
  • 12. 4.6 Visuaize the clusters cluster_table <- group_by(tt_trajectory,Cluster) %>% count(Cluster) %>% filter(n>1)%>% select(Cluster,n) Cluster_centroids <- data.frame(ClusterId = c(),Lat = c(),Lon = c(),no_of_flows = c()) for(i in cluster_table$Cluster){ tripids_of_cluster <- tt_trajectory[tt_trajectory$Cluster==i,]$TRIP_ID lat_lon_o_df <- data.frame(lat_lon_origin) cluster_o_positions <- filter(lat_lon_o_df, lat_lon_o_df$TripId %in% tripids_of_cluster)%>%select(Lat,Lon) Cluster_centroids <- rbind(Cluster_centroids, data.frame(ClusterId=i, Lat = apply(cluster_o_positions,2,median)[1], Lon = apply(cluster_o_positions,2,median)[2], no_of_flows = cluster_table[cluster_table$Cluster==i,]$n)) lat_lon_d_df <- data.frame(lat_lon_dest) cluster_d_positions <- filter(lat_lon_d_df, lat_lon_d_df$TripId %in% tripids_of_cluster) %>% select(Lat,Lon)
  • 13. Cluster_centroids <- rbind(Cluster_centroids, data.frame(ClusterId = i, Lat = apply(cluster_d_positions,2,median)[1], Lon = apply(cluster_d_positions,2,median)[2], no_of_flows = cluster_table[cluster_table$Cluster==i,]$n)) } map.portugal+ geom_line(data = Cluster_centroids, aes(x=Lon,y=Lat,group =ClusterId, size = no_of_flows, alpha = 1.5),color = "red")