This commit is contained in:
parent
3c83d9370f
commit
e7c88bacc6
@ -35,3 +35,12 @@ class Config:
|
|||||||
|
|
||||||
REDIS_URL = os.environ["REDIS_URL"]
|
REDIS_URL = os.environ["REDIS_URL"]
|
||||||
COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/
|
COLLECTOR_URL = os.environ["COLLECTOR_URL"] # http://example.com/report/
|
||||||
|
|
||||||
|
# how many times an increasing queue must be observed for an action to be taken
|
||||||
|
RESCHEDULE_TIRGGER_LEVEL = int(os.environ.get("RESCHEDULE_TIRGGER_LEVEL", 5))
|
||||||
|
|
||||||
|
# The counter clears itself after some time
|
||||||
|
RESCHEDULE_TRIGGER_COUNTER_TTL = int(os.environ.get("RESCHEDULE_TRIGGER_COUNTER_TTL", 60))
|
||||||
|
|
||||||
|
# how long a mark should live on a site which had trouble recently
|
||||||
|
RECENT_TROUBLE_TTL = int(os.environ.get("RECENT_TROUBLE_TTL", 120))
|
||||||
|
@ -8,10 +8,6 @@ from k8s_buzerator import ensure_running_pod_on_site
|
|||||||
|
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
RESCHEDULE_TIRGGER_LEVEL = 5 # how many times an incraising queue must be observed for an action to be taken
|
|
||||||
RESCHEDULE_TRIGGER_COUNTER_TTL = 60 # The counter clears itself after some time
|
|
||||||
RECENT_TROUBLE_TTL = 120 # how long a mark should live on a site which had trouble recently
|
|
||||||
|
|
||||||
|
|
||||||
def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
||||||
run_count = redis_client.incr("RUNCOUNT")
|
run_count = redis_client.incr("RUNCOUNT")
|
||||||
@ -31,17 +27,16 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
|||||||
key = f"INCRAISINGQUEUE:{site_name}"
|
key = f"INCRAISINGQUEUE:{site_name}"
|
||||||
incraising_queue_detected_times = redis_client.incr(key)
|
incraising_queue_detected_times = redis_client.incr(key)
|
||||||
|
|
||||||
if incraising_queue_detected_times > RESCHEDULE_TIRGGER_LEVEL:
|
if incraising_queue_detected_times > Config.RESCHEDULE_TIRGGER_LEVEL:
|
||||||
logging.debug(f"Tirgger level reached at {site_name}")
|
logging.debug(f"Tirgger level reached at {site_name}")
|
||||||
redis_client.delete(key)
|
redis_client.delete(key)
|
||||||
incraising_queue_at.append(site_name)
|
incraising_queue_at.append(site_name)
|
||||||
else:
|
else:
|
||||||
logging.debug(
|
logging.debug(f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{Config.RESCHEDULE_TIRGGER_LEVEL})")
|
||||||
f"Suspicious queue size change at {site_name} ({incraising_queue_detected_times}/{RESCHEDULE_TIRGGER_LEVEL})")
|
redis_client.expire(key, Config.RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
|
||||||
redis_client.expire(key, RESCHEDULE_TRIGGER_COUNTER_TTL) # Probably extend lifetime
|
|
||||||
|
|
||||||
# decide on default for the first time
|
# decide on default for the first time
|
||||||
if run_count > RESCHEDULE_TIRGGER_LEVEL * 2:
|
if run_count > Config.RESCHEDULE_TIRGGER_LEVEL * 2:
|
||||||
default_site = redis_client.get("DEFAULT:SCHEDULED")
|
default_site = redis_client.get("DEFAULT:SCHEDULED")
|
||||||
if not default_site:
|
if not default_site:
|
||||||
logging.debug("Default site is not set. Selecting one...")
|
logging.debug("Default site is not set. Selecting one...")
|
||||||
@ -89,7 +84,7 @@ def run(redis_client: Redis, site_url_map: Dict[str, str]):
|
|||||||
|
|
||||||
# If attention required, schedule a single workload to one tier lower
|
# If attention required, schedule a single workload to one tier lower
|
||||||
for site_seeking_attention in incraising_queue_at:
|
for site_seeking_attention in incraising_queue_at:
|
||||||
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=RECENT_TROUBLE_TTL)
|
redis_client.set(f"RECENTTROUBLE:{site_seeking_attention}", b"\x01", ex=Config.RECENT_TROUBLE_TTL)
|
||||||
if current_scheduling_table_counters[site_seeking_attention] == 0:
|
if current_scheduling_table_counters[site_seeking_attention] == 0:
|
||||||
logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do")
|
logging.warning("Wtf? Site reporting trouble, but there are no workload scheduled to it... nothing to do")
|
||||||
continue
|
continue
|
||||||
|
@ -36,6 +36,7 @@ data:
|
|||||||
DEBUG: "yes"
|
DEBUG: "yes"
|
||||||
REDIS_URL: "redis://birb-scheduler-redis:6379/0"
|
REDIS_URL: "redis://birb-scheduler-redis:6379/0"
|
||||||
COLLECTOR_URL: "http://birb-latency-collector/report/"
|
COLLECTOR_URL: "http://birb-latency-collector/report/"
|
||||||
|
RECENT_TROUBLE_TTL: "240"
|
||||||
---
|
---
|
||||||
kind: ClusterRole
|
kind: ClusterRole
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
Loading…
Reference in New Issue
Block a user