1
Assignment 3
September 19, 2021
Assignment 3
Import libraries and define common helper functions
2
Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz
[5]: [{'airline': {'airline_id': 410,
'name': 'Aerocondor',
'alias': 'ANA All Nippon Airways',
'iata': '2B',
'icao': 'ARD',
'callsign': 'AEROCONDOR',
'country': 'Portugal',
'active': True},
'src_airport': {'airport_id': 2965,
'name': 'Sochi International Airport',
'city': 'Sochi',
'country': 'Russia',
'iata': 'AER',
'icao': 'URSS',
'latitude': 43.449902,
'longitude': 39.9566,
'altitude': 89,
'timezone': 3.0,
'dst': 'N',
'tz_id': 'Europe/Moscow',
'type': 'airport',
'source': 'OurAirports'},
'dst_airport': {'airport_id': 2990,
'name': 'Kazan International Airport',
'city': 'Kazan',
'country': 'Russia',
'iata': 'KZN',
'icao': 'UWKD',
'latitude': 55.606201171875,
'longitude': 49.278701782227,
'altitude': 411,
'timezone': 3.0,
'dst': 'N',
3
'tz_id': 'Europe/Moscow',
'type': 'airport',
'source': 'OurAirports'},
'codeshare': False,
Collecting genson
Using cached genson-1.2.2-py2.py3-none-any.whl
Installing collected packages: genson
Successfully installed genson-1.2.2
{
"$schema": "http://json-schema.org/schema#",
"anyOf": [
{
"type": "object"
},
{
"type": "array",
"items": {
"type": "object",
"properties": {
"airline": {
"type": "object",
"properties": {
"airline_id": {
"type": "integer"
},
"name": {
"type": "string"
},
"alias": {
"type": "string"
},
"iata": {
"type": "string"
},
"icao": {
4
"type": "string"
},
"callsign": {
"type": "string"
},
"country": {
"type": "string"
},
"active": {
"type": "boolean"
}
},
"required": [
"active",
"airline_id",
"alias",
"callsign",
"country",
"iata",
"icao",
"name"
]
},
"src_airport": {
"anyOf": [
{
"type": "null"
},
{
"type": "object",
"properties": {
"airport_id": {
"type": "integer"
},
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"country": {
"type": "string"
},
"iata": {
"type": "string"
},
"icao": {
5
"type": "string"
},
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
"altitude": {
"type": "integer"
},
"timezone": {
"type": "number"
},
"dst": {
"type": "string"
},
"tz_id": {
"type": "string"
},
"type": {
"type": "string"
},
"source": {
"type": "string"
}
},
"required": [
"airport_id",
"altitude",
"city",
"country",
"dst",
"iata",
"icao",
"latitude",
"longitude",
"name",
"source",
"timezone",
"type",
"tz_id"
]
}
]
},
"dst_airport": {
6
"anyOf": [
{
"type": "null"
},
{
"type": "object",
"properties": {
"airport_id": {
"type": "integer"
},
"name": {
"type": "string"
},
"city": {
"type": "string"
},
"country": {
"type": "string"
},
"iata": {
"type": "string"
},
"icao": {
"type": "string"
},
"latitude": {
"type": "number"
},
"longitude": {
"type": "number"
},
"altitude": {
"type": "integer"
},
"timezone": {
"type": "number"
},
"dst": {
"type": "string"
},
"tz_id": {
"type": "string"
},
"type": {
"type": "string"
},
"source": {
7
"type": "string"
}
},
"required": [
"airport_id",
"altitude",
"city",
"country",
"dst",
"iata",
"icao",
"latitude",
"longitude",
"name",
"source",
"timezone",
"type",
"tz_id"
]
}
]
},
"codeshare": {
"type": "boolean"
},
"equipment": {
"type": "array",
"items": {
"type": "string"
}
}
},
"required": [
"airline",
"codeshare",
"dst_airport",
"equipment",
"src_airport"
]
}
}
]
}
8
3.1.a JSON Schema
{'$schema': 'http://json-schema.org/draft-04/schema#', 'type':
'object', 'properties': {'airline': {'type': 'object', 'properties':
{'active': {'type': 'boolean'}, 'airline_id': {'type': 'integer'},
'alias': {'type': 'string'},
'callsign': {'type': 'string'}, 'country': {'type': 'string'}, 'iata':
{'type':
'string'}, 'icao': {'type': 'string'}, 'name': {'type': 'string'}},
'required': ['active', 'airline_id', 'alias', 'callsign', 'country',
'iata', 'icao',
'name']}, 'codeshare': {'type': 'boolean'}, 'dst_airport': {'type':
['object',
'null'], 'properties': {'airport_id': {'type': 'integer'}, 'altitude':
{'type': 'integer'}, 'city': {'type': 'string'}, 'country': {'type':
'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'},
'icao': {'type': 'string'}, 'latitude': {'type': 'number'},
'longitude': {'type': 'number'}, 'name':
{'type': 'string'}, 'source': {'type': 'string'}, 'timezone': {'type':
'number'}, 'type': {'type': 'string'}, 'tz_id': {'type': 'string'}},
'required':
9
['airport_id', 'altitude', 'city', 'country', 'dst', 'iata', 'icao',
'latitude', 'longitude', 'name', 'source', 'timezone', 'type',
'tz_id']}, 'equipment':
{'type': 'array', 'items': [{'type': 'string'}]}, 'src_airport':
{'type':
['object', 'null'], 'properties': {'airport_id': {'type': 'integer'},
'altitude': {'type': 'integer'}, 'city': {'type': 'string'},
'country': {'type':
'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'},
'icao': {'type': 'string'}, 'latitude': {'type': 'number'},
'longitude': {'type':
'number'}, 'name': {'type': 'string'}, 'source': {'type': 'string'},
'timezone':
{'type': 'number'}, 'type': {'type': 'string'}, 'tz_id': {'type':
'string'}},
'required': ['airport_id', 'altitude', 'city', 'country', 'dst',
'iata', 'icao',
'latitude', 'longitude', 'name', 'source', 'timezone', 'type',
'tz_id']}},
'required': ['airline', 'codeshare', 'dst_airport', 'equipment',
'src_airport']}
3.1.b Avro
/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.avro
10
3.1.c Parquet
pyarrow.Table airline: struct<airline_id: int64, name: string,
alias: string, iata: string, icao: string, callsign: string,
country: string, active: bool>
child 0, airline_id:
int64 child 1, name:
string child 2, alias:
string child 3, iata:
string child 4, icao:
string child 5,
callsign: string child
6, country: string
child 7, active: bool
src_airport: struct<airport_id: int64, name: string, city: string,
country: string, iata: string, icao: string, latitude: double,
longitude: double, altitude: int64, timezone: double, dst: string,
tz_id: string, type: string, source: string>
child 0, airport_id:
int64 child 1, name:
string child 2, city:
string child 3,
country: string child
4, iata: string child
5, icao: string child
6, latitude: double
,→
11
child 7, longitude:
double child 8,
altitude: int64 child
9, timezone: double
child 10, dst: string
child 11, tz_id:
string child 12, type:
string child 13,
source: string
dst_airport: struct<airport_id: int64, name: string, city: string,
country: string, iata: string, icao: string, latitude: double,
longitude: double, altitude: int64, timezone: double, dst: string,
tz_id: string, type: string, source: string>
child 0, airport_id:
int64 child 1, name:
string child 2, city:
string child 3,
country: string child
4, iata: string child
5, icao: string child
6, latitude: double
child 7, longitude:
double child 8,
altitude: int64 child
9, timezone: double
child 10, dst: string
child 11, tz_id:
string child 12, type:
string child 13,
source: string
codeshare: bool
equipment: list<item:
string> child 0, item:
string
12
3.1.d Protocol Buffers
13
14
3.2.a Simple Geohash Index
15
3.2.b Simple Search Feature
16
20000.0 km
Eppley
Airfield
3.1.e Output Sizes
[1]: import os
routes_avro_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.avro")
print (routes_avro_size, "bytes")
19646227 bytes
[56]: routes_parquet_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.parquet")
print (routes_parquet_size, "bytes")
2327907 bytes
[57]: routes_snappy_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.pb.snappy")
print (routes_snappy_size, "bytes")
3705406 bytes
[58]: routes_pb_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/results/routes.pb")
print (routes_pb_size, "bytes")
22270594 bytes
[59]: routes_JSONSCHEMA_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/assignment03/schemas/routes-schema.json")
print (routes_JSONSCHEMA_size, "bytes")
3461 bytes
17
[60]: routes_JSONSCHEMA_gzsize = os.path.getsize("/home/adil/dsc650/data/processed/openflights/routes.jsonl.gz")
print (routes_JSONSCHEMA_gzsize, "bytes")
3327145 bytes
[ ]:

More Related Content

PDF
Assignment7.pdf
PDF
Assignment 6.3.pdf
PDF
Assignment 6.2a.pdf
PDF
Assignment 6.1.pdf
PDF
Assignment 5.3.pdf
PDF
Assignment 5.2.pdf
PDF
Assignment 5.1.pdf
PDF
Assignment 4.pdf
Assignment7.pdf
Assignment 6.3.pdf
Assignment 6.2a.pdf
Assignment 6.1.pdf
Assignment 5.3.pdf
Assignment 5.2.pdf
Assignment 5.1.pdf
Assignment 4.pdf
Ad

Assignment 3.pdf

  • 1. 1 Assignment 3 September 19, 2021 Assignment 3 Import libraries and define common helper functions
  • 2. 2 Load the records from https://storage.budsc.midwest-datascience.com/data/processed/openflights/routes.jsonl.gz [5]: [{'airline': {'airline_id': 410, 'name': 'Aerocondor', 'alias': 'ANA All Nippon Airways', 'iata': '2B', 'icao': 'ARD', 'callsign': 'AEROCONDOR', 'country': 'Portugal', 'active': True}, 'src_airport': {'airport_id': 2965, 'name': 'Sochi International Airport', 'city': 'Sochi', 'country': 'Russia', 'iata': 'AER', 'icao': 'URSS', 'latitude': 43.449902, 'longitude': 39.9566, 'altitude': 89, 'timezone': 3.0, 'dst': 'N', 'tz_id': 'Europe/Moscow', 'type': 'airport', 'source': 'OurAirports'}, 'dst_airport': {'airport_id': 2990, 'name': 'Kazan International Airport', 'city': 'Kazan', 'country': 'Russia', 'iata': 'KZN', 'icao': 'UWKD', 'latitude': 55.606201171875, 'longitude': 49.278701782227, 'altitude': 411, 'timezone': 3.0, 'dst': 'N',
  • 3. 3 'tz_id': 'Europe/Moscow', 'type': 'airport', 'source': 'OurAirports'}, 'codeshare': False, Collecting genson Using cached genson-1.2.2-py2.py3-none-any.whl Installing collected packages: genson Successfully installed genson-1.2.2 { "$schema": "http://json-schema.org/schema#", "anyOf": [ { "type": "object" }, { "type": "array", "items": { "type": "object", "properties": { "airline": { "type": "object", "properties": { "airline_id": { "type": "integer" }, "name": { "type": "string" }, "alias": { "type": "string" }, "iata": { "type": "string" }, "icao": {
  • 4. 4 "type": "string" }, "callsign": { "type": "string" }, "country": { "type": "string" }, "active": { "type": "boolean" } }, "required": [ "active", "airline_id", "alias", "callsign", "country", "iata", "icao", "name" ] }, "src_airport": { "anyOf": [ { "type": "null" }, { "type": "object", "properties": { "airport_id": { "type": "integer" }, "name": { "type": "string" }, "city": { "type": "string" }, "country": { "type": "string" }, "iata": { "type": "string" }, "icao": {
  • 5. 5 "type": "string" }, "latitude": { "type": "number" }, "longitude": { "type": "number" }, "altitude": { "type": "integer" }, "timezone": { "type": "number" }, "dst": { "type": "string" }, "tz_id": { "type": "string" }, "type": { "type": "string" }, "source": { "type": "string" } }, "required": [ "airport_id", "altitude", "city", "country", "dst", "iata", "icao", "latitude", "longitude", "name", "source", "timezone", "type", "tz_id" ] } ] }, "dst_airport": {
  • 6. 6 "anyOf": [ { "type": "null" }, { "type": "object", "properties": { "airport_id": { "type": "integer" }, "name": { "type": "string" }, "city": { "type": "string" }, "country": { "type": "string" }, "iata": { "type": "string" }, "icao": { "type": "string" }, "latitude": { "type": "number" }, "longitude": { "type": "number" }, "altitude": { "type": "integer" }, "timezone": { "type": "number" }, "dst": { "type": "string" }, "tz_id": { "type": "string" }, "type": { "type": "string" }, "source": {
  • 7. 7 "type": "string" } }, "required": [ "airport_id", "altitude", "city", "country", "dst", "iata", "icao", "latitude", "longitude", "name", "source", "timezone", "type", "tz_id" ] } ] }, "codeshare": { "type": "boolean" }, "equipment": { "type": "array", "items": { "type": "string" } } }, "required": [ "airline", "codeshare", "dst_airport", "equipment", "src_airport" ] } } ] }
  • 8. 8 3.1.a JSON Schema {'$schema': 'http://json-schema.org/draft-04/schema#', 'type': 'object', 'properties': {'airline': {'type': 'object', 'properties': {'active': {'type': 'boolean'}, 'airline_id': {'type': 'integer'}, 'alias': {'type': 'string'}, 'callsign': {'type': 'string'}, 'country': {'type': 'string'}, 'iata': {'type': 'string'}, 'icao': {'type': 'string'}, 'name': {'type': 'string'}}, 'required': ['active', 'airline_id', 'alias', 'callsign', 'country', 'iata', 'icao', 'name']}, 'codeshare': {'type': 'boolean'}, 'dst_airport': {'type': ['object', 'null'], 'properties': {'airport_id': {'type': 'integer'}, 'altitude': {'type': 'integer'}, 'city': {'type': 'string'}, 'country': {'type': 'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'}, 'icao': {'type': 'string'}, 'latitude': {'type': 'number'}, 'longitude': {'type': 'number'}, 'name': {'type': 'string'}, 'source': {'type': 'string'}, 'timezone': {'type': 'number'}, 'type': {'type': 'string'}, 'tz_id': {'type': 'string'}}, 'required':
  • 9. 9 ['airport_id', 'altitude', 'city', 'country', 'dst', 'iata', 'icao', 'latitude', 'longitude', 'name', 'source', 'timezone', 'type', 'tz_id']}, 'equipment': {'type': 'array', 'items': [{'type': 'string'}]}, 'src_airport': {'type': ['object', 'null'], 'properties': {'airport_id': {'type': 'integer'}, 'altitude': {'type': 'integer'}, 'city': {'type': 'string'}, 'country': {'type': 'string'}, 'dst': {'type': 'string'}, 'iata': {'type': 'string'}, 'icao': {'type': 'string'}, 'latitude': {'type': 'number'}, 'longitude': {'type': 'number'}, 'name': {'type': 'string'}, 'source': {'type': 'string'}, 'timezone': {'type': 'number'}, 'type': {'type': 'string'}, 'tz_id': {'type': 'string'}}, 'required': ['airport_id', 'altitude', 'city', 'country', 'dst', 'iata', 'icao', 'latitude', 'longitude', 'name', 'source', 'timezone', 'type', 'tz_id']}}, 'required': ['airline', 'codeshare', 'dst_airport', 'equipment', 'src_airport']} 3.1.b Avro /home/adil/dsc650/dsc650/assignments/assignment03/results/routes.avro
  • 10. 10 3.1.c Parquet pyarrow.Table airline: struct<airline_id: int64, name: string, alias: string, iata: string, icao: string, callsign: string, country: string, active: bool> child 0, airline_id: int64 child 1, name: string child 2, alias: string child 3, iata: string child 4, icao: string child 5, callsign: string child 6, country: string child 7, active: bool src_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, timezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double ,→
  • 11. 11 child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string dst_airport: struct<airport_id: int64, name: string, city: string, country: string, iata: string, icao: string, latitude: double, longitude: double, altitude: int64, timezone: double, dst: string, tz_id: string, type: string, source: string> child 0, airport_id: int64 child 1, name: string child 2, city: string child 3, country: string child 4, iata: string child 5, icao: string child 6, latitude: double child 7, longitude: double child 8, altitude: int64 child 9, timezone: double child 10, dst: string child 11, tz_id: string child 12, type: string child 13, source: string codeshare: bool equipment: list<item: string> child 0, item: string
  • 13. 13
  • 16. 16 20000.0 km Eppley Airfield 3.1 e Output Sizes [1]: import os routes_avro_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.avro") print (routes_avro_size, "bytes") 19646227 bytes [56]: routes_parquet_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.parquet" ) print (routes_parquet_size, "bytes") 2327907 bytes [57]: routes_snappy_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.pb.snappy" ) print (routes_snappy_size, "bytes") 3705406 bytes [58]: routes_pb_size = os.path.getsize("/home/adil/dsc650/dsc650/assignments/ ,→assignment03/results/routes.pb") print (routes_pb_size, "bytes") 22270594 bytes [59]: routes_JSONSCHEMA_size = os.path.getsize("/home/adil/dsc650/dsc650/ ,→assignments/assignment03/schemas/routes- schema.json") print (routes_JSONSCHEMA_size, "bytes") 3461 bytes [60]: routes_JSONSCHEMA_gzsize = os.path.getsize("/home/adil/dsc650/data/processed/