#!/bin/sh
#
# "/usr/bin/env python3 -u" doesn't work as expected, so we use this
# shell hack to invoke Python in unbuffered mode.
#
''''exec python3 -u -- "$0" ${1+"$@"} # '''

"""
This program does very basic troubleshooting of one or two systems
running pScheduler, verifying that the service is available and
running several tests to verify that the basic features of the system
are working.
"""

import babel.dates
import datetime
import ipaddress
import optparse
import os
import re
import shlex
import socket
import sys
import time

import pscheduler


# Set up the pscheduler library's exit handling before any other work
# is done.  NOTE(review): presumably this keeps interrupts (e.g.,
# Ctrl-C) from producing a traceback -- confirm against the library.
pscheduler.set_graceful_exit()

#
# Gargle the arguments
#

class VerbatimParser(optparse.OptionParser):
    """
    Option parser whose epilog is emitted exactly as it was written
    rather than being handed to the help formatter.
    """

    def format_epilog(self, formatter):
        """Return the stored epilog untouched; the formatter is ignored."""
        verbatim = self.epilog
        return verbatim

opt_parser = VerbatimParser(
    # VerbatimParser returns the epilog unmodified, so the layout of
    # this string is exactly what it hands back for the help text.
    epilog="""
Examples:

  troubleshoot
      Troubleshoot the local system

  troubleshoot --host ps.example.com
      Troubleshoot ps.example.com

  troubleshoot ps2.example.com
      Troubleshoot the local host and ps2.example.com

  troubleshoot --host ps.example.com ps2.example.com
      Troubleshoot ps.example.com and ps2.example.com
""",
    usage="Usage: %prog [ OPTIONS ] host",
)

# Stop looking for options once the first positional argument is seen.
opt_parser.disable_interspersed_args()

# Base host used when --host is not given on the command line.
DEFAULT_BASE_HOST = pscheduler.api_local_host()

opt_parser.add_option(
    "--host",
    dest="host",
    action="store", type="string",
    default=DEFAULT_BASE_HOST,
    help="Base host for troubleshooting")

opt_parser.add_option(
    "--ip-version",
    dest="ip_version",
    action="store", type="int",
    default=None,
    help="IP version to use (4 or 6)")

opt_parser.add_option(
    "--quick",
    dest="quick",
    action="store_true",
    default=False,
    help="Don't do anything time-consuming")

opt_parser.add_option(
    "--skip-single",
    dest="skip_single",
    action="store_true",
    default=False,
    help="Skip most single-participant diagnostics")

opt_parser.add_option(
    "--stats",
    dest="stats",
    action="store_true",
    default=False,
    help="Dump server statistics")

options, remaining_args = opt_parser.parse_args()

# At most one positional argument (the second host) is accepted.
if len(remaining_args) > 1:
    opt_parser.print_usage()
    pscheduler.fail()


# The IP version is optional, but when given it has to be 4 or 6.
if options.ip_version not in (None, 4, 6):
    pscheduler.fail("IP version must be 4 or 6.")


# Host A is the base host (--host); host Z is the optional second host
# given as the lone positional argument.
host_a = options.host
hosts = [host_a]
host_z = remaining_args[0] if remaining_args else None

if host_z is not None:
    if host_z == host_a:
        print()
        print("Both hosts are the same; assuming you meant to troubleshoot only one.")
        print()
        # Treat this as a genuine single-host run.  Leaving host_z set
        # would let --skip-single slip past the check below and would
        # make the clock check refer to a two-host comparison that
        # never happens.
        host_z = None
    else:
        hosts.append(host_z)



# Skipping the single-host diagnostics leaves nothing to do unless
# there is a second host to test against.
if options.skip_single and host_z is None:
    pscheduler.fail("Nothing to do on a single host with --skip-single enabled.")



# TODO: This would be a nice "narrator" class.


def _write(text):
    """Write text to standard output as-is and flush it immediately."""
    out = sys.stdout
    out.write(text)
    out.flush()

def start(text):
    """Announce the start of a step: the text followed by '...'."""
    announcement = "%s..." % (text)
    _write(announcement)

def progress(text=None):
    """
    Show progress on the current step.

    With no text, a single dot is written.  Otherwise the text,
    preceded by a space, is announced the same way start() does it.
    """
    if text is not None:
        start(" %s" % (text))
        return

    _write(".")

def ok(message="OK."):
    """Finish the current step's line with a message ("OK." by default)."""
    closing = " %s\n" % (message)
    _write(closing)

def failed(why, fail=True):
    """
    Finish the current step's line with " Failed." and print the
    reason.  Unless fail is false, pscheduler.fail() is called
    afterward.
    """
    _write(" Failed.\n\n")
    print(why)

    if not fail:
        return

    pscheduler.fail()



def failed_diags(run, why="Task failed to run properly."):
    """
    Fail and write diagnostic data for a run

    The failure is reported with failed() and this process is then
    replaced by "pscheduler result --diags --archivings RUN", run
    through /bin/sh.

    Arguments:
      run - Handed to "pscheduler result" as its argument (callers in
            this file pass the run's URL)
      why - Explanation printed before the diagnostics

    Does not return when the exec succeeds.
    """
    failed(why, fail=False)
    print()

    result_args = ["pscheduler", "result", "--diags", "--archivings", run]
    command = " ".join(shlex.quote(arg) for arg in result_args)

    # exec replaces the process without flushing Python's file
    # objects, so push out anything still buffered to keep the
    # failure message ahead of the diagnostics.
    sys.stdout.flush()
    sys.stderr.flush()

    os.execl("/bin/sh", "/bin/sh", "-c", command)




def check_mtu(host_a, host_z=None):
    """
    Report on the MTU between hosts

    With two hosts, host_a's API is asked to measure toward host_z.
    With only one, the single host becomes the destination and None
    is passed to pscheduler.api_url() as the host to ask.
    """

    if host_z is None:
        asker, dest = None, host_a
    else:
        asker, dest = host_a, host_z

    start("  Measuring MTU")
    status, result = pscheduler.url_get(
        pscheduler.api_url(asker, "/mtu-safe"),
        params={"dest": dest},
        throw=False)

    # Statuses that are reported but not treated as failures
    tolerated = {
        404: "Not supported",
        504: "Unable to measure: timed out",
    }

    if status == 200:
        if not result["safe"]:
            ok("Unsafe or unknown: %s" % (result["message"]))
        else:
            ok(result["message"])
        return

    if status in tolerated:
        ok(tolerated[status])
        return

    failed(result)



# Statistics to fetch, as (label, URL) tuples with the URL relative to
# /stat.  A URL of None marks a section header.  The list is assembled
# from per-section groups: each header is followed by its members.
statistics = [
    entry
    for header, members in (
        ("Archiving", (
            ("Backlog", "archiving/backlog"),
            ("Upcoming", "archiving/upcoming"),
        )),
        ("HTTP Queue", (
            ("Backlog", "http-queue/backlog"),
            ("Length", "http-queue/length"),
        )),
        ("Runs", (
            ("Pending", "runs/pending"),
            ("On Deck", "runs/on-deck"),
            ("Running", "runs/running"),
            ("Cleanup", "runs/cleanup"),
            ("Finished", "runs/finished"),
            ("Overdue", "runs/overdue"),
            ("Missed", "runs/missed"),
            ("Failed", "runs/failed"),
            ("Preempted", "runs/preempted"),
            ("Non-Starting", "runs/nonstart"),
        )),
    )
    for entry in ((header, None),) + members
]



# Services run by pScheduler, as (status key, display name) pairs
services = list(zip(
    ("ticker", "scheduler", "runner", "archiver"),
    ("Ticker", "Scheduler", "Runner", "Archiver"),
))

def check_service_status(api):
    """
    Check on the status of the services

    Fetches the server's "status" document and reports on the
    services, the limits (when present) and the times of the last
    scheduled and finished runs (when present).  Nothing is done for
    API levels below 4.

    NOTE: The server queried is the module-level variable "host" (the
    per-host loop variable), not an argument.
    """

    # Older servers don't have this.
    if api < 4:
        return

    start("  Fetching service status")

    status_url = pscheduler.api_url(host, "status")
    http_status, status = pscheduler.url_get(status_url, throw=False)
    if http_status != 200:
        failed(status)
    if status is None:
        failed("Server did not provide full status.")
    ok()

    start("  Checking services")
    if status.get("services") is None:
        failed("Server did not provide service status")

    for service, name in services:
        progress(name)
        try:
            overdue = status["services"][service]["overdue"]
        except KeyError:
            failed("pScheduler did not report status for the %s service." % name)
        else:
            # Anything other than None here is treated as the service
            # not running.
            if overdue is not None:
                failed("The %s service does not appear to be running." % name)
    ok()

    # Look at the limits
    if "limits" in status:
        start("  Checking limits")
        limit_status = status["limits"]

        if not limit_status.get("ok", False):
            failed(limit_status.get("error", "Unspecified error"))
        else:
            ok()

    # Look at the schedule
    if "runs" in status:
        run_checks = (
            ('last-scheduled', '  Last run scheduled'),
            ('last-finished', '  Last run completed'),
        )
        for item, label in run_checks:
            start(label)
            last = status['runs'].get(item)
            if last is None:
                ok('Never')
                continue

            # Difference between the recorded time and now, shown as
            # a relative time.
            last_delta = pscheduler.iso8601_as_datetime(last) - pscheduler.time_now()
            ok(babel.dates.format_timedelta(last_delta, add_direction=True))


def dump_stats(host):
    """
    Acquire and dump statistics for a host

    Each entry in the statistics table is fetched from the host's
    /stat/ URL and printed, with labels dot-padded to a common width.
    A 404 is shown as "N/A"; any other non-200 status is fatal.
    """
    print("  Server Statistics:")

    # Width of the longest label among the entries that get fetched,
    # used to line up the values
    longest_label = max(
        len(label) for label, fetch in statistics if fetch is not None
    )

    last_header = ""

    for label, fetch in statistics:

        # Handle section headers
        if fetch is None:
            print("    %s" % (label))
            last_header = label
            continue

        padding = "." * (longest_label - len(label))
        print("      %s %s..." % (label, padding), end=' ')

        url = pscheduler.api_url(host, "/stat/%s" % (fetch))
        status, result = pscheduler.url_get(url, throw=False, json=False)

        if status == 404:
            # Not provided by this server
            result = "N/A"
        elif status != 200:
            print("FAILED")
            pscheduler.fail("Unable to retrieve %s %s: %s: %s"
                            % (last_header, label, status, result))

        print("%6s" % (result.strip()))


def run_task(message, lead, task, bind=None):
    """
    Run a task and return the result.

    The task is posted to the lead host, its first run is waited for
    and the run's result is fetched.  If the task has archives, their
    completion is polled for up to five seconds and checked.

    Arguments:
      message - Text announced while the task is being run
      lead - Host whose API the task is posted to
      task - Task specification (dict) to post
      bind - Passed to pscheduler.url_post() as its bind argument

    Returns the data fetched from the run's "result-href".

    Problems are reported through failed() or failed_diags() rather
    than by raising.
    """

    not_run = 'Test was scheduled but not run. Check that the [pscheduler-runner] service is running.'

    start(message)

    # Post

    tasks_url = pscheduler.api_url(lead, '/tasks')
    status, task_url = pscheduler.url_post(
        tasks_url,
        data=task,
        bind=bind,
        throw=False)

    if status != 200:
        failed(f'Failed to post task. Code: [{status}] Error: [{task_url}]')

    # Fetch posted task

    status, task_data = pscheduler.url_get("%s?detail" % (task_url), throw=False)
    if status != 200:
        failed(f'Failed to get task data. Code: [{status}] Error: [{task_data}]')

    try:
        first_run_url = task_data["detail"]["first-run-href"]
    except KeyError:
        failed('Something has gone terribly wrong. Server returned incomplete data for the task. [first-run-href] is missing.')

    progress()

    # Get first run and make sure we have what we need to function.

    status, run_data = pscheduler.url_get(first_run_url, throw=False)

    if status == 404:
        failed('The server never scheduled a run for the task. The server is too busy or there is a problem with [pscheduler-scheduler] service.')
    if status != 200:
        failed(f'Error getting first run data: Code: [{status}] Error: [{run_data}]')

    for key in ["end-time", "result-href"]:
        if key not in run_data:
            failed(f'Something has gone terribly wrong. Server did not return [{key}] with run data')

    # Wait for the end time to pass

    try:
        end_time = pscheduler.iso8601_as_datetime(run_data["end-time"])
    except ValueError as ex:
        failed(f'Something has gone terribly wrong. Server did not return a valid [end-time] for the task: {str(ex)}')

    # time.sleep() rejects negative values, so never wait less than
    # zero seconds if the end time has already passed.
    sleep_time = max(pscheduler.time_until_seconds(end_time), 0)

    progress("%d seconds" % (sleep_time))
    time.sleep(sleep_time)

    # Wait for the result to happen (or not)

    status, run_data = pscheduler.url_get(first_run_url,
                                          params={"wait-merged": True},
                                          throw=False)

    # Whatever the outcome, grab a fresh copy of the run so we can get
    # its status and show it before acting on the wait's outcome.
    second_status, second_run_data = pscheduler.url_get(first_run_url, throw=False)
    if second_status == 404:
        failed(not_run)
    if second_status != 200:
        failed(f'Unable to get run data: Code: [{second_status}] Error: [{second_run_data}]')

    # A run lacking a displayable state shouldn't crash the report.
    state_display = second_run_data.get("state-display") or "Unknown state"
    if second_run_data.get("state") == "pending":
        state_display += ", probably missed"
    progress(state_display)

    if status == 404:
        failed(not_run)
    if status != 200:
        failed(f'Error while waiting for result. Code: [{status}] Error: [{run_data}]')

    # Get the result

    status, result_data = pscheduler.url_get(run_data["result-href"],
                                             params={"wait-merged": True},
                                             throw=False)
    if status == 404:
        failed(not_run)
    if status != 200:
        failed(f'Did not get a result. Code: [{status}] Error: [{result_data}]')

    try:
        if not result_data["succeeded"]:
            failed_diags(first_run_url)
    except KeyError:
        failed('Something has gone terribly wrong. Server didn\'t return the expected result. [succeeded] is missing from the result.')

    # If there were archivings, see if they worked.

    if task.get("archives"):

        progress("Checking archiving")

        expected = len(task["archives"])

        # Poll for up to five seconds.  The else clause runs only when
        # the time runs out without a break, so finishing on the last
        # poll is not mistaken for a timeout.
        deadline = time.monotonic() + 5
        while time.monotonic() < deadline:
            status, archive_data = pscheduler.url_get(first_run_url,
                                                      throw=False)
            if status != 200:
                failed(f'Unable to fetch archive status. Code: [{status}] Error: [{archive_data}]')

            finished = [
                ar for ar in archive_data.get("archivings", [])
                # "archived" is a fallback for older systems
                if ar.get("completed", ar.get("archived", False))
            ]

            if len(finished) == expected:
                break

            time.sleep(1)
        else:
            failed("Archiving never completed.")

        # At this point, archive_data will be the last one pulled from
        # the server.  Finishing isn't the same as succeeding, so
        # count the archivings marked as archived.
        succeeded = sum(
            1 for ar in archive_data.get("archivings", [])
            if ar.get("archived", False)
        )

        if succeeded < expected:
            failed_diags(first_run_url, why="One or more archivings failed.")

    ok()
    return result_data



# Say which host(s) are about to be examined.
print("Performing basic troubleshooting of %s." % (" and ".join(hosts)))





#
# The Basics
#

# Clock responses by host, held for the two-host comparison later
clocks = {}

for host in hosts:
    print()
    print("%s:" % (host))
    print()

    # Hosts given as names get a resolution check; literal IP
    # addresses skip it.
    try:
        host_ip = ipaddress.ip_address(host)
    except ValueError:
        host_ip = None

    if host_ip is None:
        start(f'  Checking that host "{host}" resolves')
        resolved = pscheduler.dns_resolve(host, ip_version=options.ip_version)
        if resolved is None:
            failed('Not resolvable or timed out')
        ok(resolved)

    if not (options.quick or options.skip_single):
        check_mtu(host)

    start("  Looking for pScheduler")
    up, reason = pscheduler.api_ping(host)
    if not up:
        failed(reason.strip())
    else:
        ok()

    start("  Fetching API level")
    status, api = pscheduler.url_get(pscheduler.api_url(host, "api"),
                                     throw=False)
    if status != 200:
        failed(api)
    ok(api)

    start("  Checking clock")
    status, result = pscheduler.url_get(pscheduler.api_url(host, "clock"),
                                        throw=False)
    if status != 200:
        failed(result)

    # An unsynchronized clock is reported but doesn't stop the run.
    if result["synchronized"]:
        ok()
    elif host_z is None:
        ok("Unsynchronized (Not considered fatal)")
    elif host == host_a:
        ok("Unsynchronized  (See check against %s)" % (host_z))
    else:
        ok("Unsynchronized")

    # Hold this for later two-host comparison
    clocks[host] = result

    if options.skip_single:
        continue

    # Each entry is (endpoint, title, whether a 404 is acceptable)
    start("  Exercising API")
    for endpoint, title, ok_404 in (
            ("archivers", "Archivers", False),
            ("contexts", "Contexts", False),
            ("tests", "Tests", False),
            ("tools", "Tools", False),
    ):
        progress(title)
        status, result = pscheduler.url_get(
            pscheduler.api_url(host, endpoint),
            throw=False)
        acceptable = status == 200 or (status == 404 and ok_404)
        if not acceptable:
            failed(result)
    ok()

    check_service_status(api)

    if options.stats:
        dump_stats(host)

    if not options.quick:
        # A one-second idle test sent to two identical bitbucket
        # archivers, which also exercises archiving.
        run_task("  Idle test", host,
                 {
                     "schema": 1,
                     "test": {
                         "spec": {
                             "duration": "PT1S",
                             "schema": 1
                         },
                         "type": "idle"
                     },
                     "schedule": {},
                     "archives": [
                         {
                             "archiver": "bitbucket",
                             "data": {},
                             "transform": {
                                 "script": "."
                             },
                             "ttl": "PT1M"
                         }
                         for _ in range(2)
                     ],
                 }
             )



# With a single host there is nothing to compare; report success here.
# (pscheduler.succeed() is expected to end the program -- the code
# below assumes a second host.)
if len(hosts) == 1:
    print()
    pscheduler.succeed("pScheduler appears to be functioning normally.")


#
# Remote pScheduler
#

print()
print("%s and %s:" % (host_a, host_z))
print()


# Make sure both hosts resolve to the same address family

start("  Checking IP addresses")
stripped_a, stripped_a_port = pscheduler.api_host_port(host_a)
stripped_z, stripped_z_port = pscheduler.api_host_port(host_z)
normalized_a, normalized_z = pscheduler.ip_normalize_version(
    stripped_a, stripped_z, ip_version=options.ip_version)

if normalized_a is not None and normalized_z is not None:
    normalized_version, _normalized_ip = pscheduler.ip_addr_version(normalized_a)
    if options.ip_version is None or options.ip_version == normalized_version:
        ok()
    else:
        ok("IPs are version %d, version %s was requested.  This may not be a problem on %s."
           % (normalized_version, options.ip_version, host_a))
else:
    # Neither outcome here is treated as fatal; it is only noted.
    wanted_family = "same-family" if options.ip_version is None \
        else "IPv%d" % (options.ip_version)
    ok("Unable to find %s addresses for both hosts.  This may not be a problem on %s." % (
        wanted_family, host_a
    ))


# MTU Between hosts

if not options.quick:
    check_mtu(stripped_a, stripped_z)


# Compare clocks

start("  Checking timekeeping")

# Uses the clock responses saved during the per-host checks.
try:
    time_diff = pscheduler.iso8601_as_datetime(clocks[host_a]["time"]) \
                - pscheduler.iso8601_as_datetime(clocks[host_z]["time"])

    clock_difference = pscheduler.timedelta_as_seconds(time_diff)
except KeyError:
    failed("Clock tests did not yield a valid result.")

# The difference is signed (host A minus host Z), so compare its
# magnitude; otherwise host A being behind host Z by any amount would
# pass unnoticed.
if abs(clock_difference) > 1.0:
    failed("Clocks differ between hosts by %f seconds" % (abs(clock_difference)))
ok()


# Simplestream

if not options.quick:
    # Test parameters; the IP version is added only when one was
    # requested on the command line.
    simplestream_spec = {
        "source-node": host_a,
        "dest": host_z,
        "schema": 1
    }
    if options.ip_version is not None:
        simplestream_spec["ip-version"] = options.ip_version

    simplestream_task = {
        "schema": 1,
        "test": {
            "spec": simplestream_spec,
            "type": "simplestream"
        },
        "schedule": {}
    }

    simplestream_result = run_task(
        "  Simple stream test", host_a, simplestream_task
    )


#
# The End.
#

print()
pscheduler.succeed("pScheduler on both hosts appears to be functioning normally.")
