Added more knobs
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Pünkösd Marcell 2021-12-13 23:03:13 +01:00
parent 3c83d9370f
commit e7c88bacc6
3 changed files with 15 additions and 10 deletions

View File

@ -35,3 +35,12 @@ class Config:
REDIS_URL = os.environ["REDIS_URL"]
COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/
# how many times an increasing queue must be observed for an action to be taken
RESCHEDULE_TIRGGER_LEVEL = int(os.environ.get("RESCHEDULE_TIRGGER_LEVEL", 5))
# The counter clears itself after some time
RESCHEDULE_TRIGGER_COUNTER_TTL = int(os.environ.get("RESCHEDULE_TRIGGER_COUNTER_TTL", 60))
# how long a mark should live on a site which had trouble recently
RECENT_TROUBLE_TTL = int(os.environ.get("RECENT_TROUBLE_TTL", 120))

View File

@ -8,10 +8,6 @@ from k8s_buzerator import ensure_running_pod_on_site
from urllib.parse import urljoin
RESCHEDULE_TIRGGER_LEVEL = 5 # how many times an incraising queue must be observed for an action to be taken
RESCHEDULE_TRIGGER_COUNTER_TTL = 60 # The counter clears itself after some time
RECENT_TROUBLE_TTL = 120 # how long a mark should live on a site which had trouble recently
def run(redis_client: Redis, site_url_map: Dict[str, str]):
run_count = redis_client.incr("RUNCOUNT")
@ -31,17 +27,16 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
key = f"INCRAISINGQUEUE:{site_name}"
incraising_queue_detected_times = redis_client.incr(key)
if incraising_queue_detected_times > RESCHEDULE_TIRGGER_LEVEL:
if incraising_queue_detected_times > Config.RESCHEDULE_TIRGGER_LEVEL:
logging.debug(f"Tirgger level reached at {site_name}")
redis_client.delete(key)
incraising_queue_at.append(site_name)
else:
logging.debug(
f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{RESCHEDULE_TIRGGER_LEVEL})")
redis_client.expire(key, RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
logging.debug(f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{Config.RESCHEDULE_TIRGGER_LEVEL})")
redis_client.expire(key, Config.RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
# decide on default for the first time
if run_count > RESCHEDULE_TIRGGER_LEVEL * 2:
if run_count > Config.RESCHEDULE_TIRGGER_LEVEL * 2:
default_site = redis_client.get("DEFAULT:SCHEDULED")
if not default_site:
logging.debug("Default site is not set. Selecting one...")
@ -89,7 +84,7 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
# If attention required, schedule a single workload to one tier lower
for site_seeking_attention in incraising_queue_at:
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=RECENT_TROUBLE_TTL)
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=Config.RECENT_TROUBLE_TTL)
if current_scheduling_table_counters[site_seeking_attention] == 0:
logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do")
continue

View File

@ -36,6 +36,7 @@ data:
DEBUG: "yes"
REDIS_URL: "redis://birb-scheduler-redis:6379/0"
COLLECTOR_URL: "http://birb-latency-collector/report/"
RECENT_TROUBLE_TTL: "240"
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1