@martin_loetzsch
Dr. Martin Loetzsch
code.talks commerce 2018
Data Warehousing with Python
All the data of the company in one place


Data is
the single source of truth
cleaned up & validated
easy to access
embedded into the organisation
Integration of different domains
Main challenges
Consistency & correctness
Changeability
Complexity
Transparency
!2
Data warehouse = integrated data
@martin_loetzsch
Nowadays required for running a business
[Diagram: source systems (application databases, events, CSV files, APIs, …) are integrated into the DWH (orders, users, products, price histories, emails, clicks, operation events, …), which serves reporting, CRM, marketing, search, pricing, …]
Avoid click-tools
hard to debug
hard to change
hard to scale with team size / data complexity / data volume

Data pipelines as code
SQL files, Python & shell scripts
Structure & content of the data warehouse are the result of running code

Easy to debug & inspect
Develop locally, test on staging system, then deploy to production
!3
Make changing and testing things easy
@martin_loetzsch
Apply standard software engineering best practices
Megabytes → Plain scripts
Petabytes → Apache Airflow
In between → Mara
!4
Mara: the BI infrastructure of Project A
@martin_loetzsch
Open source (MIT license)
Example pipeline

# imports assumed from the mara 'data_integration' package used at the time of this talk
from data_integration.pipelines import Pipeline, Task
from data_integration.commands.bash import RunBash

pipeline = Pipeline(id='demo', description='A small pipeline ..')

pipeline.add(
    Task(id='ping_localhost', description='Pings localhost',
         commands=[RunBash('ping -c 3 localhost')]))

sub_pipeline = Pipeline(id='sub_pipeline', description='Pings ..')

for host in ['google', 'amazon', 'facebook']:
    sub_pipeline.add(
        Task(id=f'ping_{host}', description=f'Pings {host}',
             commands=[RunBash(f'ping -c 3 {host}.com')]))

sub_pipeline.add_dependency('ping_amazon', 'ping_facebook')

sub_pipeline.add(Task(id='ping_foo', description='Pings foo',
                      commands=[RunBash('ping foo')]),
                 upstreams=['ping_amazon'])

pipeline.add(sub_pipeline, upstreams=['ping_localhost'])

pipeline.add(Task(id='sleep', description='Sleeps for 2 seconds',
                  commands=[RunBash('sleep 2')]),
             upstreams=['sub_pipeline'])
!5
ETL pipelines as code
@martin_loetzsch
Pipeline = list of tasks with dependencies between them. Task = list of commands
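To actually execute the example, the pipeline can be run from Python. A minimal sketch, assuming the run_pipeline helper of the data_integration package (today's mara_pipelines exposes the same function under mara_pipelines.ui.cli); the exact import path may differ depending on the installed version:

# assumed import path; adjust to the installed mara package version
from data_integration.ui.cli import run_pipeline

run_pipeline(pipeline)  # runs all tasks in dependency order and reports progress on the console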
Target of computation


CREATE TABLE m_dim_next.region (
  region_id    SMALLINT PRIMARY KEY,
  region_name  TEXT     NOT NULL UNIQUE,
  country_id   SMALLINT NOT NULL,
  country_name TEXT     NOT NULL,
  _region_name TEXT     NOT NULL
);



Do computation and store result in table


WITH raw_region AS (
  SELECT DISTINCT country, region
  FROM m_data.ga_session
  ORDER BY country, region)

INSERT INTO m_dim_next.region
SELECT
  row_number() OVER (ORDER BY country, region) AS region_id,
  CASE WHEN (SELECT count(DISTINCT country)
             FROM raw_region r2
             WHERE r2.region = r1.region) > 1
       THEN region || ' / ' || country
       ELSE region END                         AS region_name,
  dense_rank() OVER (ORDER BY country)         AS country_id,
  country                                      AS country_name,
  region                                       AS _region_name
FROM raw_region r1;

INSERT INTO m_dim_next.region
VALUES (-1, 'Unknown', -1, 'Unknown', 'Unknown');

Speed up subsequent transformations


SELECT util.add_index(
  'm_dim_next', 'region',
  column_names := ARRAY ['_region_name', 'country_name', 'region_id']);

SELECT util.add_index(
  'm_dim_next', 'region',
  column_names := ARRAY ['country_id', 'region_id']);



ANALYZE m_dim_next.region;
!6
PostgreSQL as a data processing engine
@martin_loetzsch
Leave data in DB, Tables as (intermediate) results of processing steps
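In a mara pipeline, such SQL files become commands of tasks. A minimal sketch of the wiring (pipeline id, file name and the ExecuteSQL import path are assumptions; the ExecuteSQL command itself is shown on the next slide):

# hypothetical wiring of the SQL above into a pipeline task
from data_integration.pipelines import Pipeline, Task
from data_integration.commands.sql import ExecuteSQL

dim_pipeline = Pipeline(id='transform_dimensions',
                        description='Builds the m_dim_next schema')

dim_pipeline.add(
    Task(id='transform_region', description='Creates the region dimension',
         # transform_region.sql would contain the CREATE TABLE, INSERT and index statements above
         commands=[ExecuteSQL(sql_file_name='transform_region.sql')]))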
Execute query


ExecuteSQL(sql_file_name="preprocess-ad.sql")

cat app/data_integration/pipelines/facebook/preprocess-ad.sql \
  | PGTZ=Europe/Berlin PGOPTIONS=--client-min-messages=warning \
    psql --username=mloetzsch --host=localhost --echo-all \
         --no-psqlrc --set ON_ERROR_STOP=on kfz_dwh_etl

Read file


ReadFile(file_name="country_iso_code.csv",
         compression=Compression.NONE,
         target_table="os_data.country_iso_code",
         mapper_script_file_name="read-country-iso-codes.py",
         delimiter_char=";")

cat "dwh-data/country_iso_code.csv" \
  | .venv/bin/python3.6 "app/data_integration/pipelines/load_data/read-country-iso-codes.py" \
  | PGTZ=Europe/Berlin PGOPTIONS=--client-min-messages=warning \
    psql --username=mloetzsch --host=localhost --echo-all \
         --no-psqlrc --set ON_ERROR_STOP=on kfz_dwh_etl \
         --command="COPY os_data.country_iso_code FROM STDIN WITH CSV DELIMITER AS ';'"

Copy from other databases


Copy(sql_file_name="pdm/load-product.sql", source_db_alias="pdm",
     target_table="os_data.product",
     replace={"@@db@@": "K24Pdm", "@@dbschema@@": "ps",
              "@@client@@": "kfzteile24 GmbH"})

cat app/data_integration/pipelines/load_data/pdm/load-product.sql \
  | sed "s/@@db@@/K24Pdm/g;s/@@dbschema@@/ps/g;s/@@client@@/kfzteile24 GmbH/g" \
  | sed 's/$/$/g;s/$/$/g' | (cat && echo ';') \
  | (cat && echo ';
go') \
  | sqsh -U ***** -P ******* -S ******* -D K24Pdm -m csv \
  | PGTZ=Europe/Berlin PGOPTIONS=--client-min-messages=warning \
    psql --username=mloetzsch --host=localhost --echo-all \
         --no-psqlrc --set ON_ERROR_STOP=on kfz_dwh_etl \
         --command="COPY os_data.product FROM STDIN WITH CSV HEADER"
!7
Shell commands as interface to data & DBs
@martin_loetzsch
Nothing is faster than a unix pipe
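The pattern is easy to reproduce outside of mara: a command object only renders a shell pipeline string and hands it to bash. A minimal sketch of the idea (not mara's actual implementation; function and parameter names are made up):

# minimal sketch: stream a csv file into a Postgres table through a unix pipe
import subprocess

def copy_csv_to_postgres(file_name: str, target_table: str,
                         delimiter: str = ';', db: str = 'dwh') -> None:
    shell_command = (f'cat "{file_name}" '
                     f'| psql --no-psqlrc --set ON_ERROR_STOP=on {db} '
                     f'--command="COPY {target_table} FROM STDIN WITH CSV '
                     f'DELIMITER AS \'{delimiter}\'"')
    subprocess.run(['bash', '-c', shell_command], check=True)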
Read a set of files


pipeline.add(
    ParallelReadFile(
        id="read_download",
        description="Loads PyPI downloads from pre_downloaded csv files",
        file_pattern="*/*/*/pypi/downloads-v1.csv.gz",
        read_mode=ReadMode.ONLY_NEW,
        compression=Compression.GZIP,
        target_table="pypi_data.download",
        delimiter_char="\t", skip_header=True, csv_format=True,
        file_dependencies=read_download_file_dependencies,
        date_regex="^(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/",
        partition_target_table_by_day_id=True,
        timezone="UTC",
        commands_before=[
            ExecuteSQL(
                sql_file_name="create_download_data_table.sql",
                file_dependencies=read_download_file_dependencies)
        ]))
Split large joins into chunks

pipeline.add(
    ParallelExecuteSQL(
        id="transform_download",
        description="Maps downloads to their dimensions",
        sql_statement="SELECT pypi_tmp.insert_download(@chunk@::SMALLINT);",
        parameter_function=etl_tools.utils.chunk_parameter_function,
        parameter_placeholders=["@chunk@"],
        commands_before=[
            ExecuteSQL(sql_file_name="transform_download.sql")
        ]),
    upstreams=["preprocess_project_version",
               "transform_installer"])
!8
Incremental & parallel processing
@martin_loetzsch
You can’t join all clicks with all customers at once
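A minimal sketch of how such a parameter function could look (the real one lives in etl_tools.utils.chunk_parameter_function; the body shown here and the modulo scheme in the comment are assumptions):

# hypothetical chunk parameter function: one parameter tuple per chunk, so that
# ParallelExecuteSQL runs the SQL statement once per chunk, in parallel
number_of_chunks = 10

def chunk_parameter_function():
    return [(chunk,) for chunk in range(number_of_chunks)]

# inside the database, pypi_tmp.insert_download(chunk) would then only process rows
# where e.g. some_id % number_of_chunks = chunk (column name made up for illustration)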
Runnable app
Integrates PyPI project download stats with GitHub repo events
!9
Try it out: Python project stats data warehouse
@martin_loetzsch
https://github.com/mara/mara-example-project
!10
Refer us a data person, earn 200€
@martin_loetzsch
Also analysts, developers, product managers
Thank you
@martin_loetzsch
!11
