Skip to content

Commit

Permalink
corefile uploader: Updates per review comments offline (#3915)
Browse files Browse the repository at this point in the history
* Updates per review comments
1) core_uploader service waits for syslog.service
2) core_uploader service enabled for restart on failure
3) Use mtime instead of file size, plus an ample wait time, to be robust.

* Avoid re-uploading already uploaded files by marking their names with a prefix.

* Updated the failure path.
1) If the rc file is missing or required data is missing, it periodically logs an error in an endless loop.
2) If the upload fails, retry every hour, logging an error each time, forever.

* Fix a few bugs

* The binary update_json.py will come from sonic-utilities.
  • Loading branch information
renukamanavalan authored and abdosi committed Dec 31, 2019
1 parent 7acd169 commit 2d079a1
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 77 deletions.
1 change: 0 additions & 1 deletion files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,6 @@ sudo cp $IMAGE_CONFIGS/hostcfgd/*.j2 $FILESYSTEM_ROOT_USR_SHARE_SONIC_TEMPLATES/
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.service $FILESYSTEM_ROOT/etc/systemd/system/
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl disable core_uploader.service
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_uploader.py $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/corefile_uploader/update_json.py $FILESYSTEM_ROOT/usr/bin/
sudo cp $IMAGE_CONFIGS/corefile_uploader/core_analyzer.rc.json $FILESYSTEM_ROOT_ETC_SONIC/
sudo chmod og-rw $FILESYSTEM_ROOT_ETC_SONIC/core_analyzer.rc.json
sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install azure-storage
Expand Down
47 changes: 28 additions & 19 deletions files/image_config/corefile_uploader/core_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@

HOURS_4 = (4 * 60 * 60)
PAUSE_ON_FAIL = (60 * 60)
WAIT_FILE_WRITE1 = (10 * 60)
WAIT_FILE_WRITE2= (5 * 60)
POLL_SLEEP = (60 * 60)
MAX_RETRIES = 5
UPLOAD_PREFIX = "UPLOADED_"

log_level = syslog.LOG_DEBUG

Expand Down Expand Up @@ -116,7 +120,7 @@ def run(self):
self.observer.start()
try:
while True:
time.sleep(5)
time.sleep(POLL_SLEEP)
except:
self.observer.stop()
log_err("Error in watcher")
Expand Down Expand Up @@ -179,29 +183,33 @@ def on_any_event(event):
elif event.event_type == 'created':
# Take any action here when a file is first created.
log_debug("Received create event - " + event.src_path)
Handler.wait_for_file_write_complete(event.src_path)
Handler.handle_file(event.src_path)


@staticmethod
def wait_for_file_write_complete(path):
ct_size = -1
mtime = 0

while ct_size != os.path.getsize(path):
ct_size = os.path.getsize(path)
time.sleep(2)
# Sleep long enough for the file dump to complete.
time.sleep(WAIT_FILE_WRITE1)

time.sleep(2)
if ct_size != os.path.getsize(path):
# Give another chance & poll until mtime stabilizes
while mtime != os.stat(path).st_mtime:
mtime = os.stat(path).st_mtime
time.sleep(10)

# A safety pause for double confirmation
time.sleep(WAIT_FILE_WRITE2)
if mtime != os.stat(path).st_mtime:
raise Exception("Dump file creation is too slow: " + path)
# Give up as something is terribly wrong with this file.

log_debug("File write complete - " + path)


@staticmethod
def handle_file(path):

Handler.wait_for_file_write_complete(path)

lpath = "/".join(cwd)
make_new_dir(lpath)
os.chdir(lpath)
Expand All @@ -221,18 +229,18 @@ def handle_file(path):
tar.close()
log_debug("Tar file for upload created: " + tarf_name)

Handler.upload_file(tarf_name, tarf_name)
Handler.upload_file(tarf_name, tarf_name, path)

log_debug("File uploaded - " + path)
os.chdir(INIT_CWD)

@staticmethod
def upload_file(fname, fpath):
def upload_file(fname, fpath, coref):
daemonname = fname.split(".")[0]
i = 0
fail_msg = ""

while i <= MAX_RETRIES:
while True:
try:
svc = FileService(account_name=acctname, account_key=acctkey)

Expand All @@ -246,22 +254,23 @@ def upload_file(fname, fpath):

svc.create_file_from_path(sharename, "/".join(l), fname, fpath)
log_debug("Remote file created: name{} path{}".format(fname, fpath))
newcoref = os.path.dirname(coref) + "/" + UPLOAD_PREFIX + os.path.basename(coref)
os.rename(coref, newcoref)
break

except Exception as e:
log_err("core uploader failed: Failed during upload (" + str(e) +")")
fail_msg = str(e)
except Exception as ex:
log_err("core uploader failed: Failed during upload (" + coref + ") err: ("+ str(ex) +") retry:" + str(i))
if not os.path.exists(fpath):
break
i += 1
if i >= MAX_RETRIES:
raise Exception("Failed while uploading. msg(" + fail_msg + ") after " + str(i) + " retries")
time.sleep(PAUSE_ON_FAIL)


@staticmethod
def scan():
for e in os.listdir(CORE_FILE_PATH):
fl = CORE_FILE_PATH + e
if os.path.isfile(fl):
if os.path.isfile(fl) and not e.startswith(UPLOAD_PREFIX):
Handler.handle_file(fl)


Expand Down
6 changes: 4 additions & 2 deletions files/image_config/corefile_uploader/core_uploader.service
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
[Unit]
Description=Host core file uploader daemon
Requires=updategraph.service
After=updategraph.service
Requires=syslog.service
After=syslog.service

[Service]
Type=simple
ExecStart=/usr/bin/core_uploader.py
StandardOutput=null
Restart=on-failure

[Install]
WantedBy=multi-user.target
55 changes: 0 additions & 55 deletions files/image_config/corefile_uploader/update_json.py

This file was deleted.

0 comments on commit 2d079a1

Please sign in to comment.