This commit is contained in:
parent
3c83d9370f
commit
e7c88bacc6
@ -35,3 +35,12 @@ class Config:
|
||||
|
||||
REDIS_URL = os.environ["REDIS_URL"]
|
||||
COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/
|
||||
|
||||
# how many times an increasing queue must be observed for an action to be taken
|
||||
RESCHEDULE_TIRGGER_LEVEL = int(os.environ.get("RESCHEDULE_TIRGGER_LEVEL", 5))
|
||||
|
||||
# The counter clears itself after some time
|
||||
RESCHEDULE_TRIGGER_COUNTER_TTL = int(os.environ.get("RESCHEDULE_TRIGGER_COUNTER_TTL", 60))
|
||||
|
||||
# how long a mark should live on a site which had trouble recently
|
||||
RECENT_TROUBLE_TTL = int(os.environ.get("RECENT_TROUBLE_TTL", 120))
|
||||
|
@ -8,10 +8,6 @@ from k8s_buzerator import ensure_running_pod_on_site
|
||||
|
||||
from urllib.parse import urljoin
|
||||
|
||||
RESCHEDULE_TIRGGER_LEVEL = 5 # how many times an incraising queue must be observed for an action to be taken
|
||||
RESCHEDULE_TRIGGER_COUNTER_TTL = 60 # The counter clears itself after some time
|
||||
RECENT_TROUBLE_TTL = 120 # how long a mark should live on a site which had trouble recently
|
||||
|
||||
|
||||
def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
||||
run_count = redis_client.incr("RUNCOUNT")
|
||||
@ -31,17 +27,16 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
||||
key = f"INCRAISINGQUEUE:{site_name}"
|
||||
incraising_queue_detected_times = redis_client.incr(key)
|
||||
|
||||
if incraising_queue_detected_times > RESCHEDULE_TIRGGER_LEVEL:
|
||||
if incraising_queue_detected_times > Config.RESCHEDULE_TIRGGER_LEVEL:
|
||||
logging.debug(f"Tirgger level reached at {site_name}")
|
||||
redis_client.delete(key)
|
||||
incraising_queue_at.append(site_name)
|
||||
else:
|
||||
logging.debug(
|
||||
f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{RESCHEDULE_TIRGGER_LEVEL})")
|
||||
redis_client.expire(key, RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
|
||||
logging.debug(f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{Config.RESCHEDULE_TIRGGER_LEVEL})")
|
||||
redis_client.expire(key, Config.RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
|
||||
|
||||
# decide on default for the first time
|
||||
if run_count > RESCHEDULE_TIRGGER_LEVEL * 2:
|
||||
if run_count > Config.RESCHEDULE_TIRGGER_LEVEL * 2:
|
||||
default_site = redis_client.get("DEFAULT:SCHEDULED")
|
||||
if not default_site:
|
||||
logging.debug("Default site is not set. Selecting one...")
|
||||
@ -89,7 +84,7 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
||||
|
||||
# If attention required, schedule a single workload to one tier lower
|
||||
for site_seeking_attention in incraising_queue_at:
|
||||
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=RECENT_TROUBLE_TTL)
|
||||
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=Config.RECENT_TROUBLE_TTL)
|
||||
if current_scheduling_table_counters[site_seeking_attention] == 0:
|
||||
logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do")
|
||||
continue
|
||||
|
@ -36,6 +36,7 @@ data:
|
||||
DEBUG: "yes"
|
||||
REDIS_URL: "redis://birb-scheduler-redis:6379/0"
|
||||
COLLECTOR_URL: "http://birb-latency-collector/report/"
|
||||
RECENT_TROUBLE_TTL: "240"
|
||||
---
|
||||
kind: ClusterRole
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
|
Loading…
Reference in New Issue
Block a user