Make the auto-resize crawler volume ratio adjustable (#2076)
Make the available / used storage ratio (for crawler volumes) adjustable.
Auto-resize is disabled if the ratio is set to 0.
Follow-up to #2023
ikreymer committed Sep 12, 2024
1 parent 49ce894 commit 1f919de
Showing 3 changed files with 23 additions and 7 deletions.
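
The change wires a new CRAWLER_MIN_AVAIL_STORAGE_RATIO environment variable from the Helm chart into the crawl operator. As a minimal sketch of the intended semantics (this standalone snippet is illustrative, not code from the commit), the ratio is parsed from the environment and a value of 0 turns auto-resize off:

import os

# Read the ratio the same way the operator does (see the diff below):
# an unset or "0" value parses to 0.0, which disables auto-resize.
min_avail_storage_ratio = float(
    os.environ.get("CRAWLER_MIN_AVAIL_STORAGE_RATIO") or 0
)

if not min_avail_storage_ratio:
    print("crawler volume auto-resize is disabled")
else:
    print(f"resize when used storage * {min_avail_storage_ratio} exceeds allocated storage")
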
17 changes: 12 additions & 5 deletions backend/btrixcloud/operator/crawls.py
@@ -76,9 +76,6 @@
 # set memory limit to this much of request for extra padding
 MEM_LIMIT_PADDING = 1.2
 
-# ensure available storage is at least this much times used storage
-AVAIL_STORAGE_RATIO = 2.5
-
 
 # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
 # pylint: disable=invalid-name, too-many-lines, too-many-return-statements
@@ -93,6 +90,8 @@ class CrawlOperator(BaseOperator):
     fast_retry_secs: int
     log_failed_crawl_lines: int
 
+    min_avail_storage_ratio: float
+
     def __init__(self, *args):
         super().__init__(*args)
 
@@ -104,6 +103,11 @@ def __init__(self, *args):
 
         self.log_failed_crawl_lines = int(os.environ.get("LOG_FAILED_CRAWL_LINES") or 0)
 
+        # ensure available storage is at least this much times used storage
+        self.min_avail_storage_ratio = float(
+            os.environ.get("CRAWLER_MIN_AVAIL_STORAGE_RATIO") or 0
+        )
+
     def init_routes(self, app):
         """init routes for this operator"""
 
@@ -1336,12 +1340,15 @@ async def update_crawl_state(
 
                 if (
                     status.state == "running"
+                    and self.min_avail_storage_ratio
                     and pod_info.allocated.storage
-                    and pod_info.used.storage * AVAIL_STORAGE_RATIO
+                    and pod_info.used.storage * self.min_avail_storage_ratio
                     > pod_info.allocated.storage
                 ):
                     new_storage = math.ceil(
-                        pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000
+                        pod_info.used.storage
+                        * self.min_avail_storage_ratio
+                        / 1_000_000_000
                     )
                     pod_info.newStorage = f"{new_storage}Gi"
                     print(
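
For reference, the resize rule added in update_crawl_state can be read as a small standalone helper; the function name and sample values below are illustrative, but the condition and rounding follow the diff above:

import math


def new_storage_request(used_bytes: int, allocated_bytes: int, ratio: float) -> str | None:
    """Return a new volume size such as "30Gi" when a resize is warranted, else None."""
    # a ratio of 0 (or no allocation reported yet) disables auto-resize
    if not ratio or not allocated_bytes:
        return None
    # resize only when used storage times the ratio exceeds the current allocation
    if used_bytes * ratio <= allocated_bytes:
        return None
    # round up to whole units of 1_000_000_000 bytes, formatted as the operator does
    return f"{math.ceil(used_bytes * ratio / 1_000_000_000)}Gi"


# example: 12 GB used on a 25 GB volume with the default ratio of 2.5
print(new_storage_request(12_000_000_000, 25_000_000_000, 2.5))  # -> "30Gi"
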
2 changes: 2 additions & 0 deletions chart/templates/configmap.yaml
@@ -60,6 +60,8 @@ data:
 
   MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
 
+  CRAWLER_MIN_AVAIL_STORAGE_RATIO: "{{ .Values.crawler_min_avail_storage_ratio }}"
+
   ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}"
 
   BILLING_ENABLED: "{{ .Values.billing_enabled }}"
11 changes: 9 additions & 2 deletions chart/values.yaml
@@ -75,7 +75,7 @@ allow_dupe_invites: "0"
 invite_expire_seconds: 604800
 
 # base url for replayweb.page
-rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@1.8.15/"
+rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@2.1.4/"
 
 superuser:
   # set this to enable a superuser admin
@@ -288,12 +288,19 @@ enable_auto_resize_crawlers: false
 # the workdir is used to store the browser profile data and other temporary files
 # profile_browser_workdir_size: 4Gi
 
+
 # Other Crawler Settings
 # ----------------------
 
 # minimum size allocated to each crawler
 # should be at least double crawl session size to ensure space for WACZ and browser profile data
-crawler_storage: "26Gi"
+crawler_storage: "25Gi"
+
+
+# if set, will ensure 'crawler_storage' is at least this times used storage
+# eg. if crawler session reaches 10Gb, and this value is 2.5, will attempt
+# to resize to at least 25Gb.
+crawler_min_avail_storage_ratio: 2.5
 
 # max size at which crawler will commit current crawl session
 crawler_session_size_limit_bytes: "10000000000"
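
A quick check of the example in the comment above, applying the same rounding the operator uses (the values are taken from the comment; the snippet itself is illustrative):

import math

used_bytes = 10_000_000_000   # crawl session has written ~10 GB
ratio = 2.5                   # crawler_min_avail_storage_ratio

print(f"{math.ceil(used_bytes * ratio / 1_000_000_000)}Gi")  # -> 25Gi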
