Added more knobs
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Pünkösd Marcell 2021-12-13 23:03:13 +01:00
parent 3c83d9370f
commit e7c88bacc6
3 changed files with 15 additions and 10 deletions

View File

@ -35,3 +35,12 @@ class Config:
REDIS_URL = os.environ["REDIS_URL"] REDIS_URL = os.environ["REDIS_URL"]
COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/ COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/
# how many times an increasing queue must be observed for an action to be taken
RESCHEDULE_TIRGGER_LEVEL = int(os.environ.get("RESCHEDULE_TIRGGER_LEVEL", 5))
# The counter clears itself after some time
RESCHEDULE_TRIGGER_COUNTER_TTL = int(os.environ.get("RESCHEDULE_TRIGGER_COUNTER_TTL", 60))
# how long a mark should live on a site which had trouble recently
RECENT_TROUBLE_TTL = int(os.environ.get("RECENT_TROUBLE_TTL", 120))

View File

@ -8,10 +8,6 @@ from k8s_buzerator import ensure_running_pod_on_site
from urllib.parse import urljoin from urllib.parse import urljoin
RESCHEDULE_TIRGGER_LEVEL = 5 # how many times an incraising queue must be observed for an action to be taken
RESCHEDULE_TRIGGER_COUNTER_TTL = 60 # The counter clears itself after some time
RECENT_TROUBLE_TTL = 120 # how long a mark should live on a site which had trouble recently
def run(redis_client: Redis, site_url_map: Dict[str, str]): def run(redis_client: Redis, site_url_map: Dict[str, str]):
run_count = redis_client.incr("RUNCOUNT") run_count = redis_client.incr("RUNCOUNT")
@ -31,17 +27,16 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
key = f"INCRAISINGQUEUE:{site_name}" key = f"INCRAISINGQUEUE:{site_name}"
incraising_queue_detected_times = redis_client.incr(key) incraising_queue_detected_times = redis_client.incr(key)
if incraising_queue_detected_times > RESCHEDULE_TIRGGER_LEVEL: if incraising_queue_detected_times > Config.RESCHEDULE_TIRGGER_LEVEL:
logging.debug(f"Tirgger level reached at {site_name}") logging.debug(f"Tirgger level reached at {site_name}")
redis_client.delete(key) redis_client.delete(key)
incraising_queue_at.append(site_name) incraising_queue_at.append(site_name)
else: else:
logging.debug( logging.debug(f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{Config.RESCHEDULE_TIRGGER_LEVEL})")
f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{RESCHEDULE_TIRGGER_LEVEL})") redis_client.expire(key, Config.RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
redis_client.expire(key, RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
# decide on default for the first time # decide on default for the first time
if run_count > RESCHEDULE_TIRGGER_LEVEL * 2: if run_count > Config.RESCHEDULE_TIRGGER_LEVEL * 2:
default_site = redis_client.get("DEFAULT:SCHEDULED") default_site = redis_client.get("DEFAULT:SCHEDULED")
if not default_site: if not default_site:
logging.debug("Default site is not set. Selecting one...") logging.debug("Default site is not set. Selecting one...")
@ -89,7 +84,7 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
# If attention required, schedule a single workload to one tier lower # If attention required, schedule a single workload to one tier lower
for site_seeking_attention in incraising_queue_at: for site_seeking_attention in incraising_queue_at:
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=RECENT_TROUBLE_TTL) redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=Config.RECENT_TROUBLE_TTL)
if current_scheduling_table_counters[site_seeking_attention] == 0: if current_scheduling_table_counters[site_seeking_attention] == 0:
logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do") logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do")
continue continue

View File

@ -36,6 +36,7 @@ data:
DEBUG: "yes" DEBUG: "yes"
REDIS_URL: "redis://birb-scheduler-redis:6379/0" REDIS_URL: "redis://birb-scheduler-redis:6379/0"
COLLECTOR_URL: "http://birb-latency-collector/report/" COLLECTOR_URL: "http://birb-latency-collector/report/"
RECENT_TROUBLE_TTL: "240"
--- ---
kind: ClusterRole kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1