podlaunch/main.py

199 lines
7.8 KiB
Python

import datetime
import json
import logging
import os
import pathlib
import random
import sys
import threading
import traceback
from queue import SimpleQueue
from signal import signal, SIGHUP, SIGINT, SIGTERM, setitimer, SIGALRM, ITIMER_REAL, SIGUSR1, SIGUSR2, strsignal
import click
import sh
SERVICES_BASE_PATH = "/infra/services/"
POD_CHECK_TIME = 120.0
RAND_OFFSET_MAX_SECONDS = 10.0
shlog = sh.bake(_out=sys.stdout, _err=sys.stderr)
sdnotify = sh.Command("systemd-notify")
class PodKeeper:
def __init__(self, network, log_driver, log_level, replace, remove, identifier):
self.podnet_args = ()
self.podnet_args += ("--network", network) if network else ()
self.podnet_args += ("--log-driver", log_driver) if log_driver else ()
self.podnet_args += ("--log-level", log_level) if log_level else ()
self.replace = replace
self.remove = remove
identifier_path = pathlib.PurePath(identifier)
if len(identifier_path.parts) != 1:
raise ValueError(f"identifier has path parts: {identifier_path}")
self.podhome = pathlib.Path(SERVICES_BASE_PATH) / identifier_path
if not self.podhome.exists():
raise NotADirectoryError(f"pod home does not exist: {self.podhome}")
self.podname = f"{identifier}_pod"
self.podyaml = f"pod-{identifier}.yaml"
podyaml_complete = (self.podhome / self.podyaml)
if not podyaml_complete.exists():
raise FileNotFoundError(f"pod definition does not exist: {podyaml_complete}")
self.stopping = threading.Event()
self.reloading = threading.Event()
self.checking = threading.Event()
self.waiter = threading.Event()
self.last_check = datetime.datetime.now(datetime.UTC)
self.passing_signal = threading.Event()
self.pass_signal_nums = SimpleQueue()
def destroy(self, signum, stackframe):
print("Destroy signal", signum, file=sys.stderr, flush=True)
self.stopping.set()
self.waiter.set()
def reload(self, signum, stackframe):
print("Reload signal", signum, file=sys.stderr, flush=True)
self.reloading.set()
self.waiter.set()
def check(self, signum, stackframe):
self.checking.set()
self.waiter.set()
def passthrough(self, signum, stackframe):
self.pass_signal_nums.put(item=signum, block=True, timeout=3)
self.passing_signal.set()
self.waiter.set()
def run(self):
os.chdir(self.podhome)
if self.replace and sh.podman.pod.exists(self.podname, _ok_code=[0, 1], _return_cmd=True).exit_code == 0:
print(f"Replacing existing pod {self.podname}", file=sys.stderr, flush=True)
shlog.podman.pod.stop(self.podname)
shlog.podman.pod.rm("-f", self.podname)
print(f"Starting pod {self.podname} at {self.last_check}", file=sys.stderr, flush=True)
shlog.podman.play.kube(self.podyaml, *self.podnet_args)
try:
shlogger = logging.getLogger("sh.command")
oldlevel = shlogger.level
shlogger.setLevel(logging.ERROR)
if 'NOTIFY_SOCKET' in os.environ:
sdnotify("--ready", f"--pid={os.getpid()}", "--status=Monitoring pod...")
while not self.stopping.is_set():
self.waiter.wait()
self.waiter.clear()
if self.passing_signal.is_set():
self.passing_signal.clear()
while not self.pass_signal_nums.empty():
signum = self.pass_signal_nums.get(block=True, timeout=2)
self.signal_pod(signum)
if self.checking.is_set():
self.checking.clear()
self.check_pod()
if self.reloading.is_set():
self.reloading.clear()
self.signal_pod(SIGHUP)
if 'NOTIFY_SOCKET' in os.environ:
sdnotify("--status=Stopping pod")
logging.getLogger("sh.command").setLevel(oldlevel)
finally:
self.stop_pod()
def signal_pod(self, signum):
print(f"Sending signal '{strsignal(signum)}' to pod {self.podname}", file=sys.stderr, flush=True)
try:
shlog.podman.pod.kill("--signal", str(signum), self.podname)
except sh.ErrorReturnCode:
print("Error signaling pod", file=sys.stderr, flush=True)
traceback.print_exc()
def check_pod(self):
new_timestamp = datetime.datetime.now(datetime.UTC)
inspect_command = sh.podman.pod.inspect(self.podname, _return_cmd=True)
multiple_descriptions = json.loads(inspect_command.stdout)
if not multiple_descriptions:
print(f"No pod descriptions found for {self.podname}", file=sys.stderr, flush=True)
self.stopping.set()
return
assert len(multiple_descriptions) == 1, f"Single pod description expected for {self.podname}"
pod_description = multiple_descriptions[0]
for container in pod_description["Containers"]:
if container["State"] != "running":
print(f"Container {container['Name']} exited", file=sys.stderr, flush=True)
logs_since = self.last_check - datetime.timedelta(seconds=POD_CHECK_TIME)
print(f"Log since last check (-{POD_CHECK_TIME}s):\n", file=sys.stderr, flush=True)
shlog.podman.logs('--since', logs_since.isoformat(), container['Name'], _out=sys.stderr)
self.stopping.set()
self.last_check = new_timestamp
def stop_pod(self):
print("Stopping pod", self.podname, file=sys.stderr, flush=True)
try:
shlog.podman.pod.stop("-t", "19", self.podname)
successful_stopped = True
except sh.ErrorReturnCode:
print(f"First stop of {self.podname} was not successful!", file=sys.stderr, flush=True)
successful_stopped = False
try:
shlog.podman.pod.stop("-t", "5", self.podname)
except sh.ErrorReturnCode:
if not successful_stopped:
print(f"Second stop of {self.podname} was not successful!", file=sys.stderr, flush=True)
if self.remove:
try:
shlog.podman.pod.rm(self.podname)
except sh.ErrorReturnCode:
print(f"Removal of {self.podname} was not successful!", file=sys.stderr, flush=True)
@click.command()
@click.option("--network", default="brodge", help="Network for the created pod")
@click.option("--log-driver", default="journald", help="Logging driver for the created pod")
@click.option("--log-level", default="", help="Controls log-level on podman call")
@click.option("--replace/--no-replace", default=True, help="Controls replacement of previously running pod with the "
"same name")
@click.option("--remove/--keep", default=True, help="Controls removal of pod after stopping")
@click.option("--verbose", is_flag=True, default=False, help="Enable DEBUG logging")
@click.argument("identifier")
def main(network, log_driver, log_level, replace, remove, verbose, identifier):
if verbose:
logging.root.setLevel(logging.DEBUG)
else:
logging.root.setLevel(logging.INFO)
keeper = PodKeeper(
network=network,
log_driver=log_driver,
log_level=log_level,
replace=replace,
remove=remove,
identifier=identifier
)
signal(SIGINT, keeper.destroy)
signal(SIGTERM, keeper.destroy)
signal(SIGHUP, keeper.reload)
signal(SIGALRM, keeper.check)
signal(SIGUSR1, keeper.passthrough)
signal(SIGUSR2, keeper.passthrough)
random_offset = random.random() * RAND_OFFSET_MAX_SECONDS
setitimer(ITIMER_REAL, 1.0 + random_offset, POD_CHECK_TIME + random_offset)
keeper.run()
if __name__ == '__main__':
logging.basicConfig()
main()