SlideShare a Scribd company logo
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Nested	
  Types	
  in	
  Impala
Alex	
  Behm	
  //	
  Cloudera,	
  Inc.	
  
Marcel	
  Kornacker	
  //	
  Cloudera,	
  Inc.	
  
Skye	
  Wanderman-Milne	
  //	
  Cloudera,	
  Inc.
Impala	
  Meetup	
  03/24/2015
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Design	
  Goals
• Goals	
  
• Support	
  for	
  nested	
  data	
  types:	
  struct,	
  map,	
  array	
  
• Full	
  expressiveness	
  of	
  SQL	
  with	
  nested	
  structures	
  
• v1	
  Prioritization	
  
• Focus	
  on	
  SELECT	
  queries	
  (INSERT	
  in	
  later	
  releases)	
  
• Focus	
  on	
  native	
  Parquet	
  and	
  Avro	
  formats	
  (XML,	
  JSON,	
  etc	
  in	
  later	
  releases)	
  
• Focus	
  on	
  built-­‐in	
  language	
  expressiveness	
  (UDTF	
  extensibility	
  in	
  later	
  releases)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  Schema
CREATE TABLE Customers (
  id BIGINT,
  address STRUCT<
    city: STRING,
    zip: INT
  >,
  orders ARRAY<STRUCT<
    txn_time: TIMESTAMP,
    cc: BIGINT,
    items: ARRAY<STRUCT<
      item_no: STRING,
      price: DECIMAL(9,2)
    >>
  >>,
  preferred_cc BIGINT
)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  Extensions
• Path	
  expressions	
  extend	
  column	
  references	
  to	
  scalars	
  (nested	
  structs)	
  
• Can	
  appear	
  anywhere	
  a	
  conventional	
  column	
  reference	
  is	
  used	
  
• Collections	
  (maps	
  and	
  arrays)	
  are	
  exposed	
  like	
  sub-­‐tables	
  
• Use	
  FROM	
  clause	
  to	
  specify	
  which	
  collections	
  to	
  read	
  like	
  conventional	
  tables	
  
• Can	
  use	
  JOIN	
  conditions	
  to	
  express	
  join	
  relationship	
  (default	
  is	
  INNER	
  JOIN)
Find	
  the	
  ids	
  and	
  city	
  of	
  customers	
  who	
  live	
  in	
  the	
  zip	
  code	
  94305:	
  
SELECT	
  id,	
  address.city	
  FROM	
  customers	
  WHERE	
  address.zip	
  =	
  94305
Find	
  all	
  orders	
  that	
  were	
  paid	
  for	
  with	
  a	
  customer’s	
  preferred	
  credit	
  card:	
  
SELECT	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  WHERE	
  o.cc	
  =	
  c.preferred_cc	
  	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Referencing	
  Arrays	
  &	
  Maps
• Basic	
  idea:	
  Flatten	
  nested	
  collections	
  referenced	
  in	
  the	
  FROM	
  clause	
  
• Can	
  be	
  thought	
  of	
  as	
  an	
  implicit	
  join	
  on	
  the	
  parent/child	
  relationship
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o
c.id o.txn_id
100 203
100 305
100 507
… …
101 10056
101 10
… …
id	
  of	
  a	
  customer	
  repeated	
  
for	
  every	
  order
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Referencing	
  Arrays	
  &	
  Maps
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  INNER	
  JOIN	
  c.orders	
  o
Returns	
  customer/order	
  data	
  for	
  customers	
  that	
  have	
  at	
  least	
  one	
  order
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  LEFT	
  OUTER	
  JOIN	
  c.orders	
  o
Also	
  returns	
  customers	
  with	
  no	
  orders	
  (with	
  order	
  fields	
  NULL)
SELECT	
  c.id,	
  o.txn_id	
  FROM	
  customers	
  c	
  LEFT	
  ANTI	
  JOIN	
  c.orders	
  o
Find	
  all	
  customers	
  that	
  have	
  no	
  orders
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Motivation	
  for	
  Advanced	
  Querying	
  Capabilities
• Count	
  the	
  number	
  of	
  orders	
  per	
  customer	
  
• Count	
  the	
  number	
  of	
  items	
  per	
  order	
  
• Impractical	
  → Requires	
  unique	
  id	
  at	
  every	
  nesting	
  level	
  
• Information	
  is	
  already	
  expressed	
  in	
  nesting	
  relationship!	
  
• What	
  about	
  even	
  more	
  interesting	
  queries?	
  
• Get	
  the	
  number	
  of	
  orders	
  and	
  the	
  average	
  item	
  price	
  per	
  customer?	
  
• “Group	
  by”	
  multiple	
  nesting	
  levels
SELECT	
  COUNT(*)	
  FROM	
  customers	
  c,	
  c.orders	
  o	
  GROUP	
  BY	
  c.id
SELECT	
  COUNT(*)	
  FROM	
  customers.orders	
  o,	
  o.items	
  GROUP	
  BY	
  ???
Must	
  be	
  unique
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Advanced	
  Querying:	
  Correlated	
  Table	
  References
• Count	
  the	
  number	
  of	
  orders	
  per	
  customer	
  
• Count	
  the	
  number	
  of	
  items	
  per	
  order	
  
• Get	
  the	
  number	
  of	
  orders	
  and	
  the	
  average	
  item	
  price	
  per	
  customer
SELECT	
  cnt	
  FROM	
  customers	
  c,	
  (SELECT	
  COUNT(*)	
  cnt	
  FROM	
  c.orders)	
  v
SELECT	
  cnt	
  FROM	
  customers.orders	
  o,	
  (SELECT	
  COUNT(*)	
  cnt	
  FROM	
  o.items)	
  v
SELECT	
  c.id,	
  cnt,	
  avgp	
  
FROM	
  customer	
  c,	
  
	
  	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1,	
  
	
  	
  (SELECT	
  avg(price)	
  avgp	
  FROM	
  c.orders.items)	
  v2
Correlated	
  reference	
  to	
  “c”	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Advanced	
  Querying:	
  Correlated	
  Table	
  References
• Full	
  expressibility	
  
• Arbitrary	
  SQL	
  allowed	
  in	
  inline	
  views	
  and	
  subqueries	
  with	
  correlated	
  table	
  refs	
  
• Correlated	
  subqueries	
  transformed	
  into	
  joins	
  if	
  possible	
  
• Exploits	
  nesting	
  relationship	
  
• No	
  need	
  for	
  stored	
  unique	
  ids	
  at	
  various	
  levels	
  
• Similar	
  to	
  standard	
  SQL	
  ‘LATERAL’,	
  ‘CROSS	
  APPLY’,	
  ‘OUTER	
  CROSS	
  APPLY’	
  
• Goes	
  beyond	
  standard	
  in	
  some	
  aspects	
  (semi/anti	
  variants)	
  
• Not	
  as	
  general	
  as	
  SQL	
  standard	
  (limited	
  correlations)
SELECT	
  id	
  FROM	
  customers	
  c	
  WHERE	
  EXISTS	
  
	
  	
  (SELECT	
  1	
  FROM	
  c.orders	
  o,	
  o.items	
  i	
  where	
  o.cc	
  =	
  c.preferred_cc	
  and	
  i.price	
  >	
  100)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Execution	
  Model
Design	
  sweet	
  spot:	
  small/medium	
  sized	
  collections	
  (max	
  few	
  hundreds	
  of	
  MB)	
  
• Built-­‐in	
  limitation	
  on	
  the	
  size	
  of	
  nested	
  collections	
  (TBD)	
  
• Huge	
  collections	
  not	
  expected	
  to	
  be	
  performant	
  and	
  rare	
  
Execution	
  Overview	
  
• Scans	
  materialize	
  minimal	
  nested	
  structure	
  in	
  memory	
  
• Three	
  new	
  exec	
  nodes	
  
• Subplan:	
  Executes	
  its	
  subplan	
  tree	
  for	
  each	
  input	
  row	
  and	
  returns	
  the	
  rows	
  
produced	
  by	
  the	
  subplan	
  
• Nested	
  Row	
  Src:	
  Returns	
  the	
  current	
  input	
  row	
  of	
  its	
  parent	
  Subplan	
  node	
  
• Unnest:	
  Scans	
  an	
  array	
  slot	
  of	
  the	
  current	
  input	
  row	
  of	
  its	
  parent	
  Subplan	
  node	
  or	
  
of	
  an	
  output	
  row	
  from	
  its	
  child	
  plan	
  node,	
  returning	
  one	
  row	
  per	
  array	
  element.	
  
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  A
SELECT	
  count(*)	
  FROM	
  customer.orders.items
Scan	
  
customer.orders.items
Aggregate	
  
count(*)
Count	
  the	
  total	
  number	
  of	
  items
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  B
SELECT	
  c.id,	
  cnt	
  
FROM	
  customer	
  c	
  
JOIN	
  (SELECT	
  count(*)	
  cnt	
  FROM	
  c.orders)	
  v
Scan	
  c
Subplan
materialize	
  
c.orders
Unnest	
  
c.orders
Aggregate	
  
count(*)
Subplan	
  Pseudo-­‐Code	
  
result	
  =	
  {}	
  
for	
  each	
  c	
  in	
  input	
  
	
  	
  set	
  c	
  in	
  dependent	
  nodes	
  
	
  	
  result	
  +=	
  rhsPlan.exec()	
  
return	
  result
Nested	
  
Row	
  Src
Cross	
  Join
set	
  c	
  in	
  dependent	
  nodes
Count	
  the	
  number	
  of	
  orders	
  per	
  customer
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  C
SELECT c.id, cnt, avgp
FROM customer c,
  (SELECT count(*) cnt FROM c.orders) v1,
  (SELECT avg(price) avgp FROM c.orders.items) v2
Scan	
  c
Subplan
Unnest	
  
c.orders
Aggregate	
  
count(*)
Unnest	
  
c.orders.items
Aggregate	
  
avg(price)
Cross	
  Join
Return	
  the	
  number	
  of	
  orders	
  and	
  
the	
  average	
  item	
  price	
  per	
  customer
Nested	
  Row	
  
Src
Cross	
  Join
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Example	
  D
SELECT c.id, cnt
FROM customer c,
  (SELECT count(*) cnt FROM c.orders o
   WHERE (SELECT sum(price) FROM o.items) > 100) v
Scan	
  c
Subplan
Unnest	
  
c.orders
Aggregate	
  
count(*)
Unnest	
  
o.items
Aggregate	
  
sum(price)	
  
sum(price)>100
Cross	
  Join
For	
  each	
  customer,	
  return	
  the	
  number	
  of	
  orders	
  
whose	
  total	
  item	
  price	
  exceeds	
  100
Subplan
Nested	
  Row	
  
Src:	
  c
Nested	
  Row	
  
Src:	
  o
Cross	
  Join
Exchange	
  
(to	
  client)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Future	
  Work
• Syntax	
  extensions	
  
• Performance	
  improvements	
  
• Parquet:	
  Scan	
  columns	
  directly	
  in	
  “unnest”,	
  avoid	
  collection	
  materialization	
  
• Codegen	
  subplans	
  
• INSERT	
  queries,	
  e.g.,	
  convert	
  flat	
  data	
  into	
  nested	
  data	
  
• UDTF	
  support	
  
• More	
  formats:	
  JSON,	
  XML,	
  etc.
SELECT	
   c.id,	
  
	
  	
   	
   count(c.orders),	
  
	
   	
   avg(c.orders.items.price)	
  
FROM	
  customer	
  c
SELECT c.id, cnt, avgp
FROM customer c,
  (SELECT count(1) cnt FROM c.orders) v1,
  (SELECT avg(price) avgp FROM c.orders.items) v2
Rewrite
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Thank	
  you
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Appendix
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Comparison	
  of	
  Impala	
  and	
  HiveQL
• Impala’s	
  syntax	
  provides	
  a	
  superset	
  of	
  Hive’s	
  functionality	
  
• HiveQL	
  has	
  similar	
  path	
  expressions	
  but	
  with	
  restrictions	
  
• Must	
  use	
  LATERAL	
  VIEW	
  in	
  FROM	
  clause;	
  more	
  verbose	
  syntax	
  
• LATERAL	
  VIEWs	
  themselves	
  have	
  many	
  restrictions,	
  no	
  arbitrary	
  SQL	
  
• Requires	
  complex	
  joins	
  or	
  unique	
  ids	
  at	
  various	
  nesting	
  levels	
  for	
  expressing	
  even	
  simple	
  queries	
  (e.g.,	
  
find	
  number	
  of	
  orders	
  per	
  customer)	
  
• Does	
  not	
  provide	
  similar	
  inline	
  view	
  and	
  subquery	
  capabilities
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
  …	
  FROM	
  customer	
  c,	
  c.orders	
  o,	
  o.items	
  i
SELECT	
  …	
  FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  c2…)	
  
LATERAL	
  VIEW	
  explode(c.orders.items)	
  as	
  (c1,	
  c2…)
SELECT	
  …	
  FROM	
  customer	
  c	
  
LEFT	
  JOIN	
  c.orders	
  LEFT	
  JOIN	
  c.orders.items
SELECT	
  …	
  FROM	
  customer	
  c	
  
OUTER	
  LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  …)	
  
OUTER	
  LATERAL	
  VIEW	
  explode(c.orders.items)	
  as	
  (c1,	
  …)
SELECT c.id FROM customer c
LEFT ANTI JOIN (SELECT cc FROM c.orders) v
ON c.preferred_cc = v.cc
SELECT	
  c.id	
  FROM	
  customer	
  c	
  
WHERE	
  NOT	
  EXISTS	
  (SELECT	
  oid	
  FROM	
  c.orders	
  WHERE	
  
c.preferred_cc	
  =	
  orders.cc)
No	
  convenient/performant	
  equivalent	
  in	
  Hive.
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
  …	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  MY_UDTF(c.orders)	
  as	
  (c1,	
  c2…)
Impala	
  will	
  not	
  support	
  Hive’s	
  builtin	
  or	
  user-­‐defined	
  
table	
  generating	
  functions	
  for	
  now.
SELECT	
  …	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  json_tuple(c.json_str,	
  …)	
  as	
  (c1,	
  c2…)
SELECT	
  count(c.orders)	
  FROM	
  customer	
  c
SELECT	
  cnt	
  FROM	
  customer	
  c	
  
JOIN	
  (SELECT	
  count(1)	
  cnt	
  FROM	
  c.orders)	
  v1
SELECT	
  count(1)	
  
FROM	
  customer	
  c	
  
LATERAL	
  VIEW	
  explode(c.orders)	
  as	
  (c1,	
  c2…)	
  
GROUP	
  BY	
  c.orders	
  
(in	
  absence	
  of	
  a	
  unique	
  key	
  in	
  ‘customer’)
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Syntax	
  vs.	
  Hive	
  Syntax
SELECT	
   c.id,	
  
	
  	
   	
   count(c.orders),	
  
	
   	
   avg(c.orders.items.price)	
  
FROM	
  customer	
  c
SELECT c.id, cnt, avgp
FROM customer c
JOIN (SELECT count(1) cnt FROM c.orders) v1
JOIN (SELECT avg(price) avgp FROM c.orders.items) v2
No	
  convenient/performant	
  equivalent	
  in	
  Hive.
• Impala	
  is	
  more	
  expressive,	
  but	
  less	
  extensible	
  until	
  UDTFs	
  are	
  supported	
  
• More	
  join	
  types:	
  inner/outer/semi/anti	
  
• Full	
  SQL	
  block	
  inside	
  correlated	
  inline	
  view	
  
• Hive	
  more	
  extensible	
  (UDTFs),	
  has	
  builtin	
  UDTFs	
  
• Hive	
  Lateral	
  View	
  very	
  rigid,	
  no	
  arbitrary	
  SQL	
  inside
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
Josh	
  Wills’s	
  Blog	
  post	
  on	
  analyzing	
  misspelled	
  queries	
  
http://blog.cloudera.com/blog/2014/08/how-to-count-events-like-a-data-scientist/	
  
• Goal:	
  Rudimentary	
  spell	
  checker	
  based	
  on	
  counting	
  query/click	
  events	
  
• Problem:	
  Cross	
  referencing	
  items	
  in	
  multiple	
  nested	
  collections	
  
• Representative	
  of	
  many	
  machine	
  learning	
  tasks	
  (Josh	
  tells	
  me)	
  
• Goal	
  was	
  not	
  naturally	
  achievable	
  with	
  Hive	
  	
  
• Josh	
  implemented	
  a	
  custom	
  Hive	
  extension	
  “WITHIN”	
  
• How	
  can	
  Impala	
  serve	
  this	
  use	
  case?
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
account_id:	
  bigint	
  
search_events:	
  array<struct<	
  
	
  	
  event_id:	
  bigint	
  
	
  	
  query:	
  string	
  
	
  	
  tstamp_sec:	
  bigint	
  
	
  	
  ...	
  
>>	
  
install_events:	
  array<struct<	
  
	
  	
  event_id:	
  bigint	
  
	
  	
  search_event_id:	
  bigint	
  
	
  	
  app_id:	
  bigint	
  
	
  	
  ...	
  
>>
SELECT	
  a.qw,	
  a.qr,	
  count(*)	
  as	
  cnt	
  
FROM	
  sessions	
  
LATERAL	
  VIEW	
  WITHIN(	
  
	
  	
  "SELECT	
  bad.query	
  qw,	
  good.query	
  qr	
  
	
  	
  FROM	
  t1	
  as	
  bad,	
  t1	
  as	
  good	
  
	
  	
  WHERE	
  bad.tstamp_sec	
  <	
  good.tstamp_sec	
  
	
  	
  AND	
  good.tstamp_sec	
  -	
  bad.tstamp_sec	
  <	
  30	
  
	
  	
  AND	
  bad.event_id	
  NOT	
  IN	
  (select	
  search_event_id	
  FROM	
  t2)	
  
	
  	
  AND	
  good.event_id	
  IN	
  (select	
  search_event_id	
  FROM	
  t2)",	
  
search_events,	
  install_events)	
  a	
  
GROUP	
  BY	
  a.qw,	
  a.qr;
SELECT bad.query, good.query, count(*) as cnt
FROM sessions s,
  s.search_events bad,
  s.search_events good
WHERE bad.tstamp_sec < good.tstamp_sec
  AND good.tstamp_sec - bad.tstamp_sec < 30
  AND bad.event_id NOT IN (select search_event_id FROM s.install_events)
  AND good.event_id IN (select search_event_id FROM s.install_events)
GROUP BY bad.query, good.query;
Josh’s	
  HiveQL	
  extension
Impala	
  SQL
Schema
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
SELECT bad.query, good.query, count(*) as cnt
FROM sessions s,
  s.search_events bad,
  s.search_events good
WHERE bad.tstamp_sec < good.tstamp_sec
  AND good.tstamp_sec - bad.tstamp_sec < 30
  AND bad.event_id NOT IN (select search_event_id FROM s.install_events)
  AND good.event_id IN (select search_event_id FROM s.install_events)
GROUP BY bad.query, good.query;
SELECT bad.query, good.query, count(*) as cnt
FROM sessions s
JOIN (SELECT * FROM s.search_events) bad
JOIN (SELECT * FROM s.search_events) good
LEFT ANTI JOIN (SELECT search_event_id FROM s.install_events) v1
ON (bad.event_id = v1.search_event_id)
LEFT SEMI JOIN (SELECT search_event_id FROM s.install_events) v2
ON (good.event_id = v2.search_event_id)
WHERE bad.tstamp_sec < good.tstamp_sec
  AND good.tstamp_sec - bad.tstamp_sec < 30
GROUP BY bad.query, good.query;
Rewrite
‹#›©	
  Cloudera,	
  Inc.	
  All	
  rights	
  reserved.
Impala	
  Nested	
  Types	
  in	
  Action
Scan	
  s
Subplan
Cross	
  Join	
  
bad.ts	
  <	
  good.ts	
  AND	
  
good.ts	
  –	
  bad.ts	
  <	
  30
Unnest	
  
s.search_events	
  good
Left	
  Anti	
  Join	
  
bad.event_id	
  =	
  
v1.search_event_id
Unnest	
  
s.install_events	
  v1
Left	
  Semi	
  Join	
  
good.event_id	
  =	
  
v2.search_event_id
Unnest	
  
s.install_events	
  v2
Aggregate	
  
count(*)	
  group	
  by	
  
bad.query,	
  good.query
Unnest	
  
s.search_events	
  bad
SELECT bad.query, good.query, count(*) as cnt
FROM sessions s
JOIN (SELECT * FROM s.search_events) bad
JOIN (SELECT * FROM s.search_events) good
LEFT ANTI JOIN (SELECT search_event_id FROM s.install_events) v1
ON (bad.event_id = v1.search_event_id)
LEFT SEMI JOIN (SELECT search_event_id FROM s.install_events) v2
ON (good.event_id = v2.search_event_id)
WHERE bad.tstamp_sec < good.tstamp_sec
  AND good.tstamp_sec - bad.tstamp_sec < 30
GROUP BY bad.query, good.query;

More Related Content

PDF
Extensible Data Modeling
PPTX
Geek Sync | Rewriting Bad SQL Code 101
PDF
2 designing tables
PDF
Dare to build vertical design with relational data (Entity-Attribute-Value)
PPTX
Love Your Database Railsconf 2017
PDF
Avoiding cursors with sql server 2005 tech republic
PDF
Access 04
PPTX
Couchbase N1QL: Index Advisor
Extensible Data Modeling
Geek Sync | Rewriting Bad SQL Code 101
2 designing tables
Dare to build vertical design with relational data (Entity-Attribute-Value)
Love Your Database Railsconf 2017
Avoiding cursors with sql server 2005 tech republic
Access 04
Couchbase N1QL: Index Advisor

Similar to Nested Types in Impala (20)

PPTX
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
PDF
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
PPTX
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
PPTX
Hug meetup impala 2.5 performance overview
PPTX
Apache Impala (incubating) 2.5 Performance Update
PDF
Impala: Real-time Queries in Hadoop
PPTX
Jethro data meetup index base sql on hadoop - oct-2014
PPTX
The Evolution of a Relational Database Layer over HBase
PPTX
Couchbase Tutorial: Big data Open Source Systems: VLDB2018
PDF
Advanced MariaDB features that developers love.pdf
PPTX
Performance Optimizations in Apache Impala
PDF
Tajolabigdatacamp2014 140618135810-phpapp01 hyunsik-choi
PDF
Cloudera Impala: A modern SQL Query Engine for Hadoop
PDF
Tajo_Meetup_20141120
PPTX
Querying NoSQL with SQL: HAVING Your JSON Cake and SELECTing it too
PPTX
Querying NoSQL with SQL - KCDC - August 2017
PDF
Accelerating distributed joins in Apache Hive: Runtime filtering enhancements
PDF
Self-serve analytics journey at Celtra: Snowflake, Spark, and Databricks
PPTX
SQL Query Optimization | SQL Query Optimization Techniques | SQL Basics | SQL...
PPTX
New Performance Benchmarks: Apache Impala (incubating) Leads Traditional Anal...
Data Modeling for Data Science: Simplify Your Workload with Complex Types in ...
Marcel Kornacker, Software Enginner at Cloudera - "Data modeling for data sci...
Friction-free ETL: Automating data transformation with Impala | Strata + Hado...
Hug meetup impala 2.5 performance overview
Apache Impala (incubating) 2.5 Performance Update
Impala: Real-time Queries in Hadoop
Jethro data meetup index base sql on hadoop - oct-2014
The Evolution of a Relational Database Layer over HBase
Couchbase Tutorial: Big data Open Source Systems: VLDB2018
Advanced MariaDB features that developers love.pdf
Performance Optimizations in Apache Impala
Tajolabigdatacamp2014 140618135810-phpapp01 hyunsik-choi
Cloudera Impala: A modern SQL Query Engine for Hadoop
Tajo_Meetup_20141120
Querying NoSQL with SQL: HAVING Your JSON Cake and SELECTing it too
Querying NoSQL with SQL - KCDC - August 2017
Accelerating distributed joins in Apache Hive: Runtime filtering enhancements
Self-serve analytics journey at Celtra: Snowflake, Spark, and Databricks
SQL Query Optimization | SQL Query Optimization Techniques | SQL Basics | SQL...
New Performance Benchmarks: Apache Impala (incubating) Leads Traditional Anal...
Ad

Recently uploaded (20)

PDF
Machine learning based COVID-19 study performance prediction
PPTX
Understanding_Digital_Forensics_Presentation.pptx
PDF
cuic standard and advanced reporting.pdf
PPTX
VMware vSphere Foundation How to Sell Presentation-Ver1.4-2-14-2024.pptx
PDF
NewMind AI Weekly Chronicles - August'25 Week I
PDF
Diabetes mellitus diagnosis method based random forest with bat algorithm
PDF
TokAI - TikTok AI Agent : The First AI Application That Analyzes 10,000+ Vira...
PDF
Empathic Computing: Creating Shared Understanding
PPTX
sap open course for s4hana steps from ECC to s4
PPTX
Programs and apps: productivity, graphics, security and other tools
PDF
Building Integrated photovoltaic BIPV_UPV.pdf
DOCX
The AUB Centre for AI in Media Proposal.docx
PPTX
Detection-First SIEM: Rule Types, Dashboards, and Threat-Informed Strategy
PPTX
MYSQL Presentation for SQL database connectivity
PDF
Profit Center Accounting in SAP S/4HANA, S4F28 Col11
PDF
Blue Purple Modern Animated Computer Science Presentation.pdf.pdf
PDF
Encapsulation theory and applications.pdf
PDF
Reach Out and Touch Someone: Haptics and Empathic Computing
PDF
Chapter 3 Spatial Domain Image Processing.pdf
PDF
7 ChatGPT Prompts to Help You Define Your Ideal Customer Profile.pdf
Machine learning based COVID-19 study performance prediction
Understanding_Digital_Forensics_Presentation.pptx
cuic standard and advanced reporting.pdf
VMware vSphere Foundation How to Sell Presentation-Ver1.4-2-14-2024.pptx
NewMind AI Weekly Chronicles - August'25 Week I
Diabetes mellitus diagnosis method based random forest with bat algorithm
TokAI - TikTok AI Agent : The First AI Application That Analyzes 10,000+ Vira...
Empathic Computing: Creating Shared Understanding
sap open course for s4hana steps from ECC to s4
Programs and apps: productivity, graphics, security and other tools
Building Integrated photovoltaic BIPV_UPV.pdf
The AUB Centre for AI in Media Proposal.docx
Detection-First SIEM: Rule Types, Dashboards, and Threat-Informed Strategy
MYSQL Presentation for SQL database connectivity
Profit Center Accounting in SAP S/4HANA, S4F28 Col11
Blue Purple Modern Animated Computer Science Presentation.pdf.pdf
Encapsulation theory and applications.pdf
Reach Out and Touch Someone: Haptics and Empathic Computing
Chapter 3 Spatial Domain Image Processing.pdf
7 ChatGPT Prompts to Help You Define Your Ideal Customer Profile.pdf
Ad

Nested Types in Impala

  • 1. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Nested  Types  in  Impala Alex  Behm  //  Cloudera,  Inc.   Marcel  Kornacker  //  Cloudera,  Inc.   Skye  Wanderman-­‐Milne  //  Cloudera,  Inc. Impala  Meetup  03/24/2015
  • 2. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Design  Goals • Goals   • Support  for  nested  data  types:  struct,  map,  array   • Full  expressiveness  of  SQL  with  nested  structures   • v1  Prioritization   • Focus  on  SELECT  queries  (INSERT  in  later  releases)   • Focus  on  native  Parquet  and  Avro  formats  (XML,  JSON,  etc  in  later  releases)   • Focus  on  built-­‐in  language  expressiveness  (UDTF  extensibility  in  later  releases)
  • 3. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  Schema CREATE TABLE Customers { id BIGINT, address STRUCT< city: STRING, zip: INT > orders ARRAY<STRUCT< txn_time: TIMESTAMP, cc: BIGINT, items: ARRAY<STRUCT< item_no: STRING, price: DECIMAL(9,2) >> >> preferred_cc BIGINT }
  • 4. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  Extensions • Path  expressions  extend  column  references  to  scalars  (nested  structs)   • Can  appear  anywhere  a  conventional  column  reference  is  used   • Collections  (maps  and  arrays)  are  exposed  like  sub-­‐tables   • Use  FROM  clause  to  specify  which  collections  to  read  like  conventional  tables   • Can  use  JOIN  conditions  to  express  join  relationship  (default  is  INNER  JOIN) Find  the  ids  and  city  of  customers  who  live  in  the  zip  code  94305:   SELECT  id,  address.city  FROM  customers  WHERE  address.zip  =  94305 Find  all  orders  that  were  paid  for  with  a  customer’s  preferred  credit  card:   SELECT  o.txn_id  FROM  customers  c,  c.orders  o  WHERE  o.cc  =  c.preferred_cc    
  • 5. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Referencing  Arrays  &  Maps • Basic  idea:  Flatten  nested  collections  referenced  in  the  FROM  clause   • Can  be  thought  of  as  an  implicit  join  on  the  parent/child  relationship SELECT  c.id,  o.txn_id  FROM  customers  c,  c.orders  o c.id o.txn_id 100 203 100 305 100 507 … … 101 10056 101 10 … … id  of  a  customer  repeated   for  every  order
  • 6. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Referencing  Arrays  &  Maps SELECT  c.id,  o.txn_id  FROM  customers  c,  c.orders  o   SELECT  c.id,  o.txn_id  FROM  customer  c  INNER  JOIN  c.orders  o Returns  customer/order  data  for  customers  that  have  at  least  one  order SELECT  c.id,  o.txn_id  FROM  customers  c  LEFT  OUTER  JOIN  c.orders  o Also  returns  customers  with  no  orders  (with  order  fields  NULL) SELECT  c.id,  o.txn_id  FROM  customers  c  LEFT  ANTI  JOIN  c.orders  o Find  all  customers  that  have  no  orders
  • 7. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Motivation  for  Advanced  Querying  Capabilities • Count  the  number  of  orders  per  customer   • Count  the  number  of  items  per  order   • Impractical  à Requires  unique  id  at  every  nesting  level   • Information  is  already  expressed  in  nesting  relationship!   • What  about  even  more  interesting  queries?   • Get  the  number  of  orders  and  the  average  item  price  per  customer?   • “Group  by”  multiple  nesting  levels SELECT  COUNT(*)  FROM  customers  c,  c.orders  o  GROUP  BY  c.id SELECT  COUNT(*)  FROM  customers.orders  o,  o.items  GROUP  BY  ??? Must  be  unique
  • 8. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Advanced  Querying:  Correlated  Table  References • Count  the  number  of  orders  per  customer   • Count  the  number  of  items  per  order   • Get  the  number  of  orders  and  the  average  item  price  per  customer SELECT  cnt  FROM  customers  c,  (SELECT  COUNT(*)  cnt  FROM  c.orders)  v SELECT  cnt  FROM  customers.orders  o,  (SELECT  COUNT(*)  cnt  FROM  o.items)  v SELECT  c.id,  cnt,  avgp   FROM  customer  c,      (SELECT  count(1)  cnt  FROM  c.orders)  v1,      (SELECT  avg(price)  avpg  FROM  c.orders.items)  v2 Correlated  reference  to  “c”  
  • 9. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Advanced  Querying:  Correlated  Table  References • Full  expressibility   • Arbitrary  SQL  allowed  in  inline  views  and  subqueries  with  correlated  table  refs   • Correlated  subqueries  transformed  into  joins  if  possible   • Exploits  nesting  relationship   • No  need  for  stored  unique  ids  at  various  levels   • Similar  to  standard  SQL  ‘LATERAL’,  ‘CROSS  APPLY’,  ‘OUTER  CROSS  APPLY’   • Goes  beyond  standard  in  some  aspects  (semi/anti  variants)   • Not  as  general  as  SQL  standard  (limited  correlations) SELECT  id  FROM  customers  c  WHERE  EXISTS      (SELECT  1  FROM  c.orders  o,  o.items  i  where  o.cc  =  c.preferred_cc  and  i.price  >  100)
  • 10. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Execution  Model Design  sweet  spot:  small/medium  sized  collections  (max  few  hundreds  of  MB)   • Built-­‐in  limitation  on  the  size  of  nested  collections  (TBD)   • Huge  collections  not  expected  to  be  performant  and  rare   Execution  Overview   • Scans  materialize  minimal  nested  structure  in  memory   • Three  new  exec  nodes   • Subplan:  Executes  its  subplan  tree  for  each  input  row  and  returns  the  rows   produced  by  the  subplan   • Nested  Row  Src:  Returns  the  current  input  row  of  its  parent  Subplan  node   • Unnest:  Scans  an  array  slot  of  the  current  input  row  of  its  parent  Subplan  node  or   of  an  output  row  from  its  child  plan  node,  returning  one  row  per  array  element.  
  • 11. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  A SELECT  count(*)  FROM  customer.orders.items Scan   orders.customer.items Aggregate   count(*) Count  the  total  number  of  items Exchange   (to  client)
  • 12. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  B SELECT  c.id,  cnt   FROM  customer  c,   JOIN  (SELECT  count(*)  cnt  FROM  c.orders)  v Scan  c Subplan materialize   c.orders Unnest   c.orders Aggregate   count(*) Subplan  Pseudo-­‐Code   result  =  {}   for  each  c  in  input      set  c  in  dependent  nodes      result  +=  rhsPlan.exec()   return  result Nested   Row  Src Cross  Join set  c  in  dependent  nodes Count  the  number  of  orders  per  customer Exchange   (to  client)
  • 13. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  C SELECT  c.id,  cnt,  avgp   FROM  customer  c,      (SELECT  count(*)  cnt  FROM  c.orders)  v1      (SELECT  avg(price)  avpg  FROM  c.orders.items)  v2 Scan  c Subplan Unnest   c.orders Aggregate   count(*) Unnest   c.orders.item Aggregate   avg(price) Cross  Join Return  the  number  of  orders  and   the  average  item  price  per  customer Nested  Row   Src Cross  Join Exchange   (to  client)
  • 14. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Example  D SELECT  c.id,  cnt,  avgp   FROM  customer  c,      (SELECT  count(*)  cnt  FROM  c.orders  o        WHERE  (SELECT  sum(price)  FROM  o.items)  >  100))  v Scan  c Subplan Unnest   c.orders Aggregate   count(*) Unnest   o.items Aggregate   sum(price)   sum(price)>100 Cross  Join For  each  customer,  return  the  number  of  orders   whose  total  item  price  exceeds  >  100 Subplan Nested  Row   Src:  c Nested  Row   Src:  o Cross  Join Exchange   (to  client)
  • 15. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Future  Work • Syntax  extensions   • Performance  improvements   • Parquet:  Scan  columns  directly  in  “unnest”,  avoid  collection  materialization   • Codegen  subplans   • INSERT  queries,  e.g.,  convert  flat  data  into  nested  data   • UDTF  support   • More  formats:  JSON,  XML,  etc. SELECT   c.id,         count(c.orders),       avg(c.orders.items.price)   FROM  customer  c SELECT  c.id,  cnt,  avgp   FROM  customer  c,      (SELECT  count(1)  cnt  FROM  c.orders)  v1      (SELECT  avg(price)  avpg  FROM  c.orders.items)  v2 Rewrite
  • 16. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Thank  you
  • 17. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Appendix
  • 18. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Comparison  of  Impala  and  HiveQL • Impala’s  syntax  provides  a  superset  of  Hive’s  functionality   • HiveQL  has  similar  path  expressions  but  with  restrictions   • Must  use  LATERAL  VIEW  in  FROM  clause;  more  verbose  syntax   • LATERAL  VIEWs  themselves  have  many  restrictions,  no  arbitrary  SQL   • Requires  complex  joins  or  unique  ids  at  various  nesting  levels  for  expressing  even  simple  queries  (e.g.,   find  number  of  orders  per  customer)   • Does  not  provide  similar  inline  view  and  subquery  capabilities
  • 19. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT  …  FROM  customer  c,  c.orders  o,  o.items  i SELECT  …  FROM  customer  c   LATERAL  VIEW  explode(c.orders)  as  (c1,  c2…)   LATERAL  VIEW  explode(c.orders.items)  as  (c1,  c2…) SELECT  …  FROM  customer  c   LEFT  JOIN  c.orders  LEFT  JOIN  c.orders.items SELECT  …  FROM  customer  c   OUTER  LATERAL  VIEW  explode(c.orders)  as  (c1,  …)   OUTER  LATERAL  VIEW  explode(c.orders.items)  as  (c1,  …) SELECT  c.id  FROM  customer  c   LEFT  ANTI  JOIN  (SELECT  cc  FROM  c.orders)  v   ON  c.preferred_cc  =  v.cc SELECT  c.id  FROM  customer  c   WHERE  NOT  EXISTS  (SELECT  oid  FROM  c.orders  WHERE   c.preferred_cc  =  orders.cc) No  convenient/performant  equivalent  in  Hive.
  • 20. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT  …   FROM  customer  c   LATERAL  VIEW  MY_UDTF(c.orders)  as  (c1,  c2…) Impala  will  not  support  Hive’s  builtin  or  user-defined   table  generating  functions  for  now. SELECT  …   FROM  customer  c   LATERAL  VIEW  json_tuple(c.json_str,  …)  as  (c1,  c2…) SELECT  count(c.orders)  FROM  customer  c SELECT  cnt  FROM  customer  c   JOIN  (SELECT  count(1)  cnt  FROM  c.orders)  v1 SELECT  count(1)   FROM  customer  c   LATERAL  VIEW  explode(c.orders)  as  (c1,  c2…)   GROUP  BY  c.orders   (in  absence  of  a  unique  key  in  ‘customer’)
  • 21. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Syntax  vs.  Hive  Syntax SELECT   c.id,         count(c.orders),       avg(c.orders.items.price)   FROM  customer  c SELECT  c.id,  cnt,  avgp   FROM  customer  c   JOIN  (SELECT  count(1)  cnt  FROM  c.orders)  v1   JOIN  (SELECT  avg(price)  avgp  FROM  c.orders.items)  v2 No  convenient/performant  equivalent  in  Hive. • Impala  is  more  expressive,  but  less  extensible  until  UDTFs  are  supported   • More  join  types:  inner/outer/semi/anti   • Full  SQL  block  inside  correlated  inline  view   • Hive  more  extensible  (UDTFs),  has  builtin  UDTFs   • Hive  Lateral  View  very  rigid,  no  arbitrary  SQL  inside
  • 22. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action Josh  Wills’s  blog  post  on  analyzing  misspelled  queries   http://blog.cloudera.com/blog/2014/08/how-to-count-events-like-a-data-scientist/   • Goal:  Rudimentary  spell  checker  based  on  counting  query/click  events   • Problem:  Cross-referencing  items  in  multiple  nested  collections   • Representative  of  many  machine  learning  tasks  (Josh  tells  me)   • Goal  was  not  naturally  achievable  with  Hive     • Josh  implemented  a  custom  Hive  extension  “WITHIN”   • How  can  Impala  serve  this  use  case?
  • 23. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action account_id:  bigint   search_events:  array<struct<      event_id:  bigint      query:  string      tstamp_sec:  bigint      ...   >>   install_events:  array<struct<      event_id:  bigint      search_event_id:  bigint      app_id:  bigint      ...   >> SELECT  a.qw,  a.qr,  count(*)  as  cnt   FROM  sessions   LATERAL  VIEW  WITHIN(      "SELECT  bad.query  qw,  good.query  qr      FROM  t1  as  bad,  t1  as  good      WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  t2)      AND  good.event_id  IN  (select  search_event_id  FROM  t2)",   search_events,  install_events)  a   GROUP  BY  a.qw,  a.qr; SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s,      s.search_events  bad,      s.search_events  good   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  s.install_events)      AND  good.event_id  IN  (select  search_event_id  FROM  s.install_events)   GROUP  BY  bad.query,  good.query; Josh’s  HiveQL  extension Impala  SQL Schema
  • 24. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s,      s.search_events  bad,      s.search_events  good   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30      AND  bad.event_id  NOT  IN  (select  search_event_id  FROM  s.install_events)      AND  good.event_id  IN  (select  search_event_id  FROM  s.install_events)   GROUP  BY  bad.query,  good.query; SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s   JOIN  (SELECT  *  FROM  s.search_events)  bad   JOIN  (SELECT  *  FROM  s.search_events)  good   LEFT  ANTI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v1   ON  (bad.event_id  =  v1.search_event_id)   LEFT  SEMI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v2   ON  (good.event_id  =  v2.search_event_id)   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30   GROUP  BY  bad.query,  good.query; Rewrite
  • 25. ‹#›©  Cloudera,  Inc.  All  rights  reserved. Impala  Nested  Types  in  Action Scan  s Subplan Cross  Join   bad.ts  <  good.ts  AND   good.ts  –  bad.ts  <  30 Unnest   s.search_events  good Left  Anti  Join   bad.event_id  =   v1.search_event_id Unnest   s.install_events  v1 Left  Semi  Join   good.event_id  =   v2.search_event_id Unnest   s.install_events  v2 Aggregate   count(*)  group  by   bad.query,  good.query Unnest   s.search_events  bad SELECT  bad.query,  good.query,  count(*)  as  cnt   FROM  sessions  s   JOIN  (SELECT  *  FROM  s.search_events)  bad   JOIN  (SELECT  *  FROM  s.search_events)  good   LEFT  ANTI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v1   ON  (bad.event_id  =  v1.search_event_id)   LEFT  SEMI  JOIN  (SELECT  search_event_id  FROM  s.install_events)  v2   ON  (good.event_id  =  v2.search_event_id)   WHERE  bad.tstamp_sec  <  good.tstamp_sec      AND  good.tstamp_sec  -  bad.tstamp_sec  <  30   GROUP  BY  bad.query,  good.query;