Python queue solution with asyncio and kafka

Queue with
asyncio and Kafka
Showcase
Ondřej Veselý

Problem:
store JSON to database
Just a few records
per second.
But
● Slow database
● Unreliable database
● Increasing traffic (20x)

def save_data(conn, cur, ts, data):
    """Insert one record into the ``data`` table and commit right away.

    Args:
        conn: Open database connection; ``commit()`` is called on it after
            the insert.
        cur: Cursor used to execute the INSERT statement.
        ts: Value written to the ``timestamp`` column.
        data: Object serialized with ``ujson.dumps`` into the ``data`` column.
    """
    params = (ts, ujson.dumps(data))
    cur.execute(
        """INSERT INTO data (timestamp, data)
VALUES (%s,%s) """, params)
    # Each record gets its own commit.
    conn.commit()
@app.route('/store', method=['PUT', 'POST'])
def logstash_route():
    """Store every record of the JSON request body in Postgres.

    The body is parsed with ``ujson.load`` and iterated; each item is written
    through ``save_data`` using one shared timestamp taken while handling the
    request. A new database connection is opened for every request.
    """
    data = ujson.load(request.body)
    conn = psycopg2.connect(**config.pg_logs)
    try:
        t = datetime.now()
        with conn.cursor(cursor_factory=DictCursor) as cur:
            for d in data:
                save_data(conn, cur, t, d)
    finally:
        # Close the connection even when an insert fails, so a failing
        # database does not also leak one connection per request.
        conn.close()
Old code

Architecture
internet
Kafka producer
/store
Kafka consumer
Kafka queue
Postgres
… time to kill consumer ...

Asyncio, example
import asyncio
async def factorial(name, number):
    """Compute ``number!`` step by step, printing progress for task ``name``.

    Before each multiplication a progress line is printed and the coroutine
    sleeps for one second, which lets other tasks run in the meantime. The
    final result is printed, not returned.
    """
    product = 1
    factor = 2
    while factor <= number:
        print("Task %s: Compute factorial(%s)..." % (name, factor))
        await asyncio.sleep(1)
        product *= factor
        factor += 1
    print("Task %s: factorial(%s) = %s" % (name, number, product))
# Create and install the loop explicitly: relying on asyncio.get_event_loop()
# to create one implicitly is deprecated and fails on recent Python versions.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
    # Schedule the three computations on this loop so they run concurrently.
    tasks = [
        loop.create_task(factorial("A", 2)),
        loop.create_task(factorial("B", 3)),
        loop.create_task(factorial("C", 4)),
    ]
    loop.run_until_complete(asyncio.gather(*tasks))
finally:
    # Release the loop's resources even if one of the tasks raised.
    loop.close()
Task A: Compute factorial(2)...
Task B: Compute factorial(2)...
Task C: Compute factorial(2)...
Task A: factorial(2) = 2
Task B: Compute factorial(3)...
Task C: Compute factorial(3)...
Task B: factorial(3) = 6
Task C: Compute factorial(4)...
Task C: factorial(4) = 24

What we used
Apache Kafka
Not ujson
Concurrency - doing lots of slow things at once.
No processes, no threads.
Producer
from aiohttp import web
import json
Consumer
import asyncio
import json
from aiokafka import AIOKafkaConsumer
import aiopg

Producer #1
async def kafka_send(kafka_producer, data, topic):
    """Wrap ``data`` in an envelope and publish it to ``topic``.

    The envelope is a JSON object holding the payload under ``'data'`` and the
    string form of ``arrow.utcnow()`` under ``'received'``. It is sent as
    UTF-8 bytes and the call waits until ``send_and_wait`` completes.
    """
    envelope = {'data': data, 'received': str(arrow.utcnow())}
    payload = json.dumps(envelope).encode('utf-8')
    await kafka_producer.send_and_wait(topic, payload)
async def handle(request):
    """Accept a JSON request body and forward it to Kafka.

    The parsed body is sent with ``kafka_send`` using the producer stored
    under ``request.app['kafka_p']`` and the topic from ``settings.topic``.
    A failure while sending is logged and followed by ``destroy_all()``.

    Returns:
        ``web.Response`` with status 200 whenever this function returns.
    """
    post_data = await request.json()
    try:
        await kafka_send(request.app['kafka_p'], post_data, topic=settings.topic)
    except Exception:
        # Narrowed from a bare ``except:`` so that task cancellation (a
        # BaseException on Python 3.8+) and interpreter-exit signals are not
        # mistaken for a Kafka failure.
        slog.exception("Kafka Error")
        await destroy_all()
    return web.Response(status=200)
# Build the aiohttp application and expose the handler at POST /store.
app = web.Application()
app.router.add_route('POST', '/store', handle)
# Keep the Kafka producer on the app object so request handlers can reach it
# through request.app['kafka_p'].
app['kafka_p'] = get_kafka_producer()

Destroying the loop
async def destroy_all():
    """Cancel every task, ask the event loop to stop, and exit the process.

    Raises:
        SystemExit: always, via ``sys.exit()``.
    """
    # asyncio.Task.all_tasks() was removed in Python 3.9; prefer the
    # module-level function and fall back only on older interpreters.
    all_tasks = getattr(asyncio, "all_tasks", None) or asyncio.Task.all_tasks
    for task in all_tasks():
        task.cancel()
    # stop() is a plain method that returns None, so it must not be awaited.
    # close() is deliberately not called here: a loop cannot be closed while
    # it is still running this coroutine.
    loop.stop()
    slog.debug("Exiting.")
    sys.exit()
def get_kafka_producer():
    """Create an ``AIOKafkaProducer``, start it, and return it.

    The producer is configured from ``settings`` and started synchronously by
    running ``start()`` to completion on the module-level ``loop``.
    """
    options = {
        'loop': loop,
        'bootstrap_servers': settings.queues_urls,
        'request_timeout_ms': settings.kafka_timeout,
        'retry_backoff_ms': 1000,
    }
    kafka_producer = AIOKafkaProducer(**options)
    startup = kafka_producer.start()
    loop.run_until_complete(startup)
    return kafka_producer
Getting producer
Producer #2

Consume
… time to resurrect consumer ...
DB
connected
1. Receive data record from Kafka
2. Put it to the queue
start
yes / no
Flush
queue full enough
or
data old enough
Store data from queue to DB
yes / no
Connect to DB
start
asyncio.Queue()
Consumer #1

def main():
    """Start the Kafka consumer and the database flusher, then run forever.

    Both coroutines share a bounded ``asyncio.Queue`` (size taken from
    ``settings.batch_max_size``) and a future that is passed to them as
    ``dbs_connected``.
    """
    # Bind the future and both tasks to the loop that is actually run below,
    # not to whichever loop asyncio currently considers the default.
    dbs_connected = loop.create_future()
    batch = asyncio.Queue(maxsize=settings.batch_max_size)
    # Hold references while the loop runs; the event loop itself only keeps
    # weak references to tasks.
    workers = [
        loop.create_task(consume(batch, dbs_connected)),
        loop.create_task(start_flushing(batch, dbs_connected)),
    ]
    loop.run_forever()
async def consume(queue, dbs_connected):
    """Read messages from Kafka and put them on ``queue``.

    Waits for ``dbs_connected`` to resolve before a consumer is created.
    Every message value is decoded as UTF-8 JSON and enqueued as a
    ``(received, data)`` tuple taken from the keys of the same names.

    Args:
        queue: Queue receiving the ``(received, data)`` tuples; ``put`` is
            awaited, so a full queue pauses consumption.
        dbs_connected: Awaitable that resolves once the database is ready.

    Raises:
        asyncio.TimeoutError: if ``dbs_connected`` does not resolve within
            ``settings.wait_for_databases`` seconds.
    """
    await asyncio.wait_for(dbs_connected, timeout=settings.wait_for_databases)
    consumer = AIOKafkaConsumer(
        settings.topic, loop=loop, bootstrap_servers=settings.queues_urls,
        group_id='consumers'
    )
    await consumer.start()
    try:
        async for msg in consumer:
            message = json.loads(msg.value.decode("utf-8"))
            await queue.put((message.get('received'), message.get('data')))
    finally:
        # Stop the consumer on every exit path (malformed message, task
        # cancellation, normal end), not only when the loop ends cleanly.
        await consumer.stop()
Consumer #2

async def start_flushing(queue, dbs_connected):
    """Run ``keep_flushing`` forever on connections taken from a pool.

    A pool is created once from ``settings.logs_db_url``. Each round acquires
    a connection and a cursor, hands the cursor to ``keep_flushing``, and when
    that returns waits two seconds before acquiring a fresh connection.
    """
    pool = await aiopg.create_pool(settings.logs_db_url)
    while True:
        async with pool.acquire() as connection:
            async with connection.cursor() as cursor:
                await keep_flushing(dbs_connected, cursor, queue)
        # Pause before the next acquire.
        await asyncio.sleep(2)
async def keep_flushing(dbs_connected, logg_cur, queue):
    """Periodically drain ``queue`` and store the batch through ``logg_cur``.

    A flush happens when the queue is non-empty and either holds more than
    ``settings.batch_flush_size`` items or the last successful store is older
    than ``settings.batch_max_time`` seconds. Between checks the coroutine
    sleeps for ``settings.batch_sleep`` seconds. It returns when storing a
    batch fails, so the caller can obtain a new connection.

    Args:
        dbs_connected: Future resolved with ``True`` to signal that the
            database is usable.
        logg_cur: Database cursor passed on to ``store_bulk``.
        queue: Queue of records to store.
    """
    # This coroutine may be entered again after a reconnect; resolving an
    # already-resolved future would raise InvalidStateError.
    if not dbs_connected.done():
        dbs_connected.set_result(True)
    last_stored_time = time.time()
    while True:
        if not queue.empty() and (queue.qsize() > settings.batch_flush_size or
                time.time() - last_stored_time > settings.batch_max_time):
            to_store = []
            while not queue.empty():
                to_store.append(await queue.get())
            try:
                await store_bulk(logg_cur, to_store)
            except Exception:
                # DB down, breaking to reconnect. Narrowed from a bare
                # ``except:`` so task cancellation is not swallowed.
                # NOTE(review): the records in ``to_store`` are dropped here;
                # whether re-queueing is safe depends on store_bulk's
                # partial-write behavior -- confirm before changing.
                break
            last_stored_time = time.time()
        await asyncio.sleep(settings.batch_sleep)
Consumer #3

Code is public on gitlab
https://gitlab.skypicker.com/ondrej/faqstorer
www.orwen.org
code.kiwi.com
www.kiwi.com/jobs/
Check graphs...

Python queue solution with asyncio and kafka

More Related Content

What's hot (20)

Viewers also liked (7)

Similar to Python queue solution with asyncio and kafka (20)

Recently uploaded (20)

Python queue solution with asyncio and kafka

Editor's Notes