Skip to content

Commit

Permalink
replace juju run and juju scp with ssh and scp
Browse files Browse the repository at this point in the history
When unit is in error or blocked state juju run will not connect.
Using ssh instead allows to collect data regardless of juju agent
status.

solves juju#28 and juju#26
  • Loading branch information
marosg42 committed Jul 23, 2019
1 parent bc6de2b commit f9e8375
Showing 1 changed file with 41 additions and 30 deletions.
71 changes: 41 additions & 30 deletions jujucrashdump/crashdump.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import tempfile
import uuid
import yaml
import concurrent.futures
from collections import defaultdict
from os.path import expanduser
try:
Expand Down Expand Up @@ -52,10 +53,13 @@


def retrieve_single_unit_tarball(tuple_input):
unique, machine, alias_group = tuple_input
unique, machine, alias_group, all_machines = tuple_input
unit_unique = uuid.uuid4()
juju_cmd("scp %s:/tmp/juju-dump-%s.tar %s.tar"
% (machine, unique, unit_unique))
scp_cmd = 'scp -o StrictHostKeyChecking=no'
for ip in all_machines[machine]:
if run_cmd("%s %s:/tmp/juju-dump-%s.tar %s.tar"
% (scp_cmd, ip, unique, unit_unique)):
break
if '/' not in machine:
machine += '/baremetal'
run_cmd("mkdir -p %s || true" % machine)
Expand Down Expand Up @@ -121,6 +125,8 @@ def run_cmd(command, fatal=False, to_file=None):
print('Command "%s" failed' % command)
if fatal:
sys.exit(1)
return False
return True


def juju_cmd(command, *args, **kwargs):
Expand Down Expand Up @@ -153,6 +159,13 @@ def juju_storage_pools():
juju_cmd('storage-pools --format=yaml', to_file='storage_pools.yaml')


def run_ssh(host, timeout, ssh_cmd, cmd):
    """Run *cmd* under sudo on the first reachable address of a machine.

    *host* is the list of candidate IP addresses for one machine
    (presumably from juju status — confirm against get_all()); each is
    tried in order until run_cmd reports success, then we stop.
    """
    # any() short-circuits on the first address where the command succeeds,
    # mirroring the original try-each-ip-then-break loop.
    any(run_cmd("timeout {}s {} {} sudo '{}'".format(
        timeout, ssh_cmd, ip, cmd)) for ip in host)


class CrashCollector(object):
"""A log file collector for juju and charms"""
def __init__(self, model, max_size, extra_dirs, output_dir=None,
Expand All @@ -176,12 +189,28 @@ def __init__(self, model, max_size, extra_dirs, output_dir=None,
self.timeout = timeout
self.journalctl = journalctl

def get_all(self):
    """Map every machine and container id in the model to its IP list.

    Parses the previously collected juju_status.yaml in the current
    working directory.

    Returns:
        dict: {machine_or_container_id: ['ip-address', ...]}
    """
    machines = {}
    # Context manager so the status file is closed deterministically
    # (the original left the handle dangling).
    with open('juju_status.yaml', 'r') as status_file:
        juju_status = yaml.load(status_file, Loader=yaml.FullLoader)
    # .items(), not the Python-2-only .iteritems(): this file already
    # relies on yaml.FullLoader / concurrent.futures-era Python 3, where
    # .iteritems() raises AttributeError.
    for machine, machine_data in juju_status['machines'].items():
        machines[machine] = machine_data['ip-addresses']
        for container, container_data in \
                machine_data.get('containers', {}).items():
            machines[container] = container_data['ip-addresses']
    return machines

def _run_all(self, cmd):
    """Run *cmd* (via sudo over ssh) on every machine in the model.

    Fans out over a thread pool with at most 10 concurrent ssh
    connections; ssh is used instead of `juju run` so data is collected
    even when the juju agent is in an error/blocked state.
    """
    all_machines = self.get_all()
    ssh_cmd = 'ssh -o StrictHostKeyChecking=no'
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # .values(), not the Python-2-only .iteritems() (which raises
        # AttributeError on Python 3); the machine-id key was unused.
        # A plain loop replaces the side-effect-only list comprehension.
        for ips in all_machines.values():
            executor.submit(run_ssh, ips, self.timeout, ssh_cmd, cmd)

def run_addons(self):
juju_status = yaml.load(open('juju_status.yaml', 'r'))
juju_status = yaml.load(open('juju_status.yaml', 'r'),
Loader=yaml.FullLoader)
machines = service_unit_addresses(juju_status).keys()
if not machines:
return
Expand All @@ -193,7 +222,7 @@ def run_listening(self):
pull_location = "/tmp/{uniq}/cmd_output".format(uniq=self.uniq)
self._run_all('mkdir -p {pull_location};'
'sudo netstat -taupn | grep LISTEN 2>/dev/null'
' > {pull_location}/listening.txt || true'
' | sudo tee {pull_location}/listening.txt || true'
''.format(pull_location=pull_location))

def run_journalctl(self):
Expand Down Expand Up @@ -238,29 +267,10 @@ def _append(parent, incl):
_append("addon_output", "."),
]))

@staticmethod
def __retrieve_single_unit_tarball(unique, tuple_input):
    """Fetch one machine's crashdump tarball with `juju scp`, unpack it
    into a per-machine directory, and symlink every unit alias to it."""
    machine, alias_group = tuple_input
    tarball_id = uuid.uuid4()
    juju_cmd("scp %s:/tmp/juju-dump-%s.tar %s.tar"
             % (machine, unique, tarball_id))
    # Top-level machines get a '/baremetal' suffix so their directory
    # layout matches that of containers (which already contain '/').
    if '/' not in machine:
        machine = machine + '/baremetal'
    run_cmd("mkdir -p %s || true" % machine)
    try:
        run_cmd("tar -pxf %s.tar -C %s" % (tarball_id, machine))
        run_cmd("rm %s.tar" % tarball_id)
    except IOError:
        # If crashdump runs while a machine is still coming up, or scp
        # fails for some other reason, there is no tarball to unpack;
        # skip this machine and carry on with the rest.
        print("Unable to retrieve tarball for %s. Skipping." % machine)
    for alias in alias_group:
        os.symlink(machine, alias.replace('/', '_'))

def retrieve_unit_tarballs(self):
juju_status = yaml.load(open('juju_status.yaml', 'r'))
all_machines = self.get_all()
juju_status = yaml.load(open('juju_status.yaml', 'r'),
Loader=yaml.FullLoader)
aliases = service_unit_addresses(juju_status)
if not aliases:
# Running against an empty model.
Expand All @@ -269,7 +279,8 @@ def retrieve_unit_tarballs(self):
pool = multiprocessing.Pool()
pool.map(
retrieve_single_unit_tarball,
[(self.uniq, key, value) for key, value in aliases.items()]
[(self.uniq, key, value, all_machines)
for key, value in aliases.items()]
)

def collect(self):
Expand Down

0 comments on commit f9e8375

Please sign in to comment.