Chef patterns

Chef
Patterns
From
Building
Clusters

Biju
Nair

Boston
DevOps
Meetup

08-July-2015

Background

•  Automate
build
&
management
of
clusters

– Hadoop

– Kafka…
etc

•  Patterns
which
can
be
used
elsewhere

Service
On
Demand

•  Common
services
which
can
be
requested

– Copy
logs
from
applications
to
a
centralized

location

– Service
available
on
all
the
nodes

– Applications
can
request
the
service
dynamically

Service
On
Demand

•  Node
Attribute
to
store
service
requests

default['bcpc']['hadoop']['copylog'] = {}
{
'app_id' => { 'logfile' => "/path/file_name_of_log_file",
'docopy' => true (or false)
},...
}
•  Data
Structure
to
make
service
requests

Service
On
Demand

•  Application
recipes
make
service
requests

#
# Updating node attributes to copy HBase master log file to HDFS
#
node.default['bcpc']['hadoop']['copylog']['hbase_master'] = {
'logfile' => "/var/log/hbase/hbase-master-#{node.hostname}.log",
'docopy' => true
}
node.default['bcpc']['hadoop']['copylog']['hbase_master_out'] = {
'logfile' => "/var/log/hbase/hbase-master-#{node.hostname}.out",
'docopy' => true
}

Service
On
Demand

•  Service
recipe

node['bcpc']['hadoop']['copylog'].each do |id,f|
if f['docopy']
template "/etc/flume/conf/flume-#{id}.conf" do
source "flume_flume-conf.erb"
action :create ...
variables(:agent_name => "#{id}",
:log_location => "#{f['logfile']}" )
notifies :restart,"service[flume-agent-multi-#{id}]",:delayed
end
service "flume-agent-multi-#{id}" do
supports :status => true, :restart => true, :reload => false
service_name "flume-agent-multi"
action :start
start_command "service flume-agent-multi start #{id}"
restart_command "service flume-agent-multi restart #{id}"
status_command "service flume-agent-multi status #{id}"
end
•  Separate
role
at
the
end
of
run
list

Pluggable
Alerts

•  Single
source
for
monitored
stats

– Allows
users
to
visualize
stats
across
different

parameters

– Didn’t
want
to
duplicate
the
stats
collection
by

alerting
system

– Need
to
feed
data
to
the
alerting
system
to

generate
alerts

Pluggable
Alerts

•  Attribute
where
users
can
define
alerts

default["bcpc"]["hadoop"]["graphite"]["queries"] = {
'hbase_master' => [
{ 'type' => "jmx",
'query' => "memory.NonHeapMemoryUsage_committed",
'key' => "hbasenonheapmem",
'trigger_val' => "max(61,0)",
'trigger_cond' => "=0",
'trigger_name' => "HBaseMasterAvailability",
'trigger_dep' => ["NameNodeAvailability"],
'trigger_desc' => "HBase master seems to be down",
'severity' => 1
},{
'type' => "jmx",
'query' => "memory.HeapMemoryUsage_committed",
'key' => "hbaseheapmem",
...
},...], 'namenode' => [...] ...}

Pluggable
Alerts

•  Recipes
and
templates
use
the
data
structure

– To
generate
queries
to
pull
data
from
statistics

store
and
send

•  https://github.com/bloomberg/chef-bach/blob/master/cookbooks/bcpc-hadoop/templates/default/graphite.query_graphite.config.erb

– To
create
requested
trigger
related
objects
in

alarming
system

•  https://github.com/bloomberg/chef-bach/blob/master/cookbooks/bcpc-hadoop/recipes/graphite_to_zabbix.rb

Pluggable
Alerts

•  Servers
Defined
in
role
is
used
by
recipes

"default_attributes" : {
"jmxtrans": {
"servers": [
{
"type": "hbase_master",
"service": "hbase-master",
"service_cmd": "org.apache.hadoop.hbase.master.HMaster"
}, {
"type": "hbase_rs",
"service": "hbase-regionserver",
"service_cmd":
"org.apache.hadoop.hbase.regionserver.HRegionServer"
}
]
} ...

Service
Restart

•  We
use
jmxtrans
to
monitor
jmx
stats

– Services
to
be
monitored
varies
with
node

– There
can
be
more
than
one
service
to
be

monitored

– Monitored
service
restart
requires
JMXtrans
to
be

restarted**

Service
Restart

•  Data
structure
in
roles
to
define
the
services

"default_attributes" : {
"jmxtrans": {
"servers": [
{
"type": "datanode",
"service": "hadoop-hdfs-datanode",
"service_cmd":
"org.apache.hadoop.hdfs.server.datanode.DataNode"
}, {
"type": "hbase_rs",
"service": "hbase-regionserver",
"service_cmd":
"org.apache.hadoop.hbase.regionserver.HRegionServer"
}
]
} ...

Service
Restart

•  Jmxtrans
service
restart
logic
built
dynamically

jmx_services = Array.new
jmx_srvc_cmds = Hash.new
node['jmxtrans']['servers'].each do |server|
jmx_services.push(server['service'])
jmx_srvc_cmds[server['service']] = server['service_cmd']
end
service "restart jmxtrans on dependent service" do
service_name "jmxtrans"
supports :restart => true, :status => true, :reload => true
action :restart
jmx_services.each do |jmx_dep_service|
subscribes :restart, "service[#{jmx_dep_service}]", :delayed
end
only_if {process_require_restart?("jmxtrans","jmxtrans-all.jar",
jmx_srvc_cmds)}
end

Service
Restart

def process_require_restart?(process_name, process_cmd, dep_cmds)
tgt_proces_pid = `pgrep -f #{process_cmd}`
...
tgt_proces_stime = `ps --no-header -o start_time #{tgt_process_pid}`
...
ret = false
restarted_processes = Array.new
dep_cmds.each do |dep_process, dep_cmd|
dep_pids = `pgrep -f #{dep_cmd}`
if dep_pids != ""
dep_pids_arr = dep_pids.split("\n")
dep_pids_arr.each do |dep_pid|
dep_process_stime = `ps --no-header -o start_time #{dep_pid}`
if DateTime.parse(tgt_proces_stime) <
DateTime.parse(dep_process_stime)
restarted_processes.push(dep_process)
ret = true
end ...

Rolling
Restart

•  Changes
to
configuration

•  Availability

– Toxic
Configuration

•  Contention

– Poll
&
Wait

– Fail
the
Run

– Simply
Skip
Service
Restart
and
Go
On

•  Store
the
state
and
need
for
restart

•  Breaks
assumptions
of
Procedural
Chef
Runs

Rolling
Restart

•  ZooKeeper

– Service
specific
znode
as
lock

•  Node
attribute
to
flag
restart
failures

https://github.com/bloomberg/chef-bach/blob/rolling_restart/cookbooks/bcpc-hadoop/definitions/hadoop_service.rb

Logic
Injection

•  We
use
Community
cookbooks

– Takes
care
of
standard
install,
enable
and
starting

of
services

•  Need
to
add
logic
to
cookbook
recipes

– Take
action
on
a
service
only
when
conditions
are

satisfied

– Take
action
on
a
service
based
on
dependent

service
state

Logic
Injection

kafka_install node.kafka.version_install_dir do
from kafka_target_path
not_if { kafka_installed? }
end
template ::File.join(node.kafka.config_dir, 'server.properties') do
source 'server.properties.erb'
...
helpers(Kafka::Configuration)
if restart_on_configuration_change?
notifies :restart, 'service[kafka]', :delayed
end
end
service 'kafka' do
provider kafka_init_opts[:provider]
supports start: true, stop: true, restart: true, status: true
action kafka_service_actions
end

Logic
Injection

•  Changes
to
standard
cookbook

– Create
a
new
recipe
to
perform
service
action

•  Resource
to
intercept
notifications
to
service
resource

•  Original
service
resource

• Add
node
attribute
which
stores
name
of
new

recipe

• Update
original
recipe

– Remove
the
service
resource
from
the
original

recipe

– Replace
it
with
include_recipe
new_attribute

Logic
Injection

•  New
recipe
to
perform
service
actions

– First
step
is
the
ruby_block
to
intercept

notifications

ruby_block 'coordinate-kafka-start' do
block do
Chef::Log.debug 'Default recipe to coordinate Kafka start is used'
end
action :nothing
notifies :restart, 'service[kafka]', :delayed
end
service 'kafka' do
action kafka_service_actions
end

Logic
Injection

•  Attribute
to
set
the
recipe
for
service
actions

#
# Attribute to set the recipe to be used to coordinate Kafka service start;
# if nothing is set, the default recipe "_coordinate" will be used
#
default.kafka.start_coordination.recipe = 'kafka::_coordinate'

Logic
Injection

•  Changes
to
the
original
recipe

kafka_install node.kafka.version_install_dir do
from kafka_target_path
not_if { kafka_installed? }
end
template ::File.join(node.kafka.config_dir, 'server.properties') do
source 'server.properties.erb'
...
helpers(Kafka::Configuration)
if restart_on_configuration_change?
notifies :create, 'ruby_block[coordinate-kafka-start]', :immediately
end
end
include_recipe node.kafka.start_coordination.recipe

Logic
Injection

•  Changes
in
wrapper
cookbook

– Create
custom
recipe
in
wrapper
cookbook

•  Notification
interceptor
ruby_block
should
be
first

•  Logic
to
determine
service
restart
action

•  service
resource

•  Any
clean-up
logic

– Overwrite
attribute
with
custom
recipe
name

Logic
Injection

ruby_block 'coordinate-kafka-start' do
block do
Chef::Log.info 'Custom recipe to coordinate Kafka start/restart'
end ...
ruby_block 'restart-coordination' do
block do
Chef::Log.info 'Implement the process to coordinate the restart'
end ...
service 'kafka' do
...
ruby_block 'restart-coordination-cleanup' do
block do
Chef::Log.info 'Implement any cleanup logic required'
end

Logic
Injection

•  Overwrite
attribute
to
set
the
custom
recipe

#
# Overwrite the community cookbook attribute with custom recipe name
#
default[:kafka][:start_coordination][:recipe] = 'kafka-bcpc::coordinate'

References

•  https://github.com/bloomberg/chef-bach

•  http://blog.asquareb.com/blog/categories/chef-patterns/

Thank
You!!

bnair@asquareb.com

Chef patterns

More Related Content

What's hot (20)

Viewers also liked (11)

Similar to Chef patterns (20)

Recently uploaded (20)

Chef patterns