diff --git a/birb_scheduler/config.py b/birb_scheduler/config.py index eefdd7c..30dcdd8 100644 --- a/birb_scheduler/config.py +++ b/birb_scheduler/config.py @@ -35,3 +35,12 @@ class Config: REDIS_URL = os.environ["REDIS_URL"] COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/ + + # how many times an increasing queue must be observed for an action to be taken + RESCHEDULE_TIRGGER_LEVEL = int(os.environ.get("RESCHEDULE_TIRGGER_LEVEL", 5)) + + # The counter clears itself after some time + RESCHEDULE_TRIGGER_COUNTER_TTL = int(os.environ.get("RESCHEDULE_TRIGGER_COUNTER_TTL", 60)) + + # how long a mark should live on a site which had trouble recently + RECENT_TROUBLE_TTL = int(os.environ.get("RECENT_TROUBLE_TTL", 120)) diff --git a/birb_scheduler/run_scheduler.py b/birb_scheduler/run_scheduler.py index ced29d1..878bc6c 100644 --- a/birb_scheduler/run_scheduler.py +++ b/birb_scheduler/run_scheduler.py @@ -8,10 +8,6 @@ from k8s_buzerator import ensure_running_pod_on_site from urllib.parse import urljoin -RESCHEDULE_TIRGGER_LEVEL = 5 # how many times an incraising queue must be observed for an action to be taken -RESCHEDULE_TRIGGER_COUNTER_TTL = 60 # The counter clears itself after some time -RECENT_TROUBLE_TTL = 120 # how long a mark should live on a site which had trouble recently - def run(redis_client: Redis, site_url_map: Dict[str, str]): run_count = redis_client.incr("RUNCOUNT") @@ -31,17 +27,16 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]): key = f"INCRAISINGQUEUE:{site_name}" incraising_queue_detected_times = redis_client.incr(key) - if incraising_queue_detected_times > RESCHEDULE_TIRGGER_LEVEL: + if incraising_queue_detected_times > Config.RESCHEDULE_TIRGGER_LEVEL: logging.debug(f"Tirgger level reached at {site_name}") redis_client.delete(key) incraising_queue_at.append(site_name) else: - logging.debug( - f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{RESCHEDULE_TIRGGER_LEVEL})") - 
redis_client.expire(key, RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime + logging.debug(f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{Config.RESCHEDULE_TIRGGER_LEVEL})") + redis_client.expire(key, Config.RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime # decide on default for the first time - if run_count > RESCHEDULE_TIRGGER_LEVEL * 2: + if run_count > Config.RESCHEDULE_TIRGGER_LEVEL * 2: default_site = redis_client.get("DEFAULT:SCHEDULED") if not default_site: logging.debug("Default site is not set. Selecting one...") @@ -89,7 +84,7 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]): # If attention required, schedule a single workload to one tier lower for site_seeking_attention in incraising_queue_at: - redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=RECENT_TROUBLE_TTL) + redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=Config.RECENT_TROUBLE_TTL) if current_scheduling_table_counters[site_seeking_attention] == 0: logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do") continue diff --git a/k8s/scheduler.yml b/k8s/scheduler.yml index e379f88..924790c 100644 --- a/k8s/scheduler.yml +++ b/k8s/scheduler.yml @@ -36,6 +36,7 @@ data: DEBUG: "yes" REDIS_URL: "redis://birb-scheduler-redis:6379/0" COLLECTOR_URL: "http://birb-latency-collector/report/" + RECENT_TROUBLE_TTL: "240" --- kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1