Fix agermanidis#67 high memory/RAM usage partially by terminating child processes, lowering default multiprocessing counts and using gc.collect(0).
joelvaneenwyk · Feb 3, 2020 · f41c846 · f41c846
1 parent d9a474b
commit f41c846
Show file tree

Hide file tree

Showing 16 changed files with 178 additions and 127 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## TOC
 
 - [Unreleased](#unreleased)
+  - [Changed](#changedunreleased)
 - [0.5.4-alpha - 2020-01-31](#054-alpha---2020-01-31)
   - [Added](#added054-alpha)
   - [Changed](#changed054-alpha)
@@ -34,6 +35,10 @@ Click up arrow to go back to TOC.
 
 ### Unreleased
 
+#### Changed(Unreleased)
+
+- Partially fix high memory/RAM usage by terminating child processes, lowering default multiprocessing counts and using `gc.collect(0)`. [issue #67](https://github.com/BingLingGroup/autosub/issues/67), [issue #74](https://github.com/BingLingGroup/autosub/issues/74)
+
 ### [0.5.4-alpha] - 2020-01-31
 
 #### Added(0.5.4-alpha)
@@ -46,6 +51,8 @@ Click up arrow to go back to TOC.
 - Fix output format limits when input is a subtitles file.
 - Remove gtransv2 support.
 
+<escape><a href = "#TOC">&nbsp;↑&nbsp;</a></escape>
+
 ### [0.5.3-alpha] - 2019-12-30
 
 #### Changed(0.5.3-alpha)
@@ -54,6 +61,8 @@ Click up arrow to go back to TOC.
 - Fix Auditok option issues. [issue #70](https://github.com/BingLingGroup/autosub/issues/70)
 - Fix output option issue. [issue #73](https://github.com/BingLingGroup/autosub/issues/73)
 
+<escape><a href = "#TOC">&nbsp;↑&nbsp;</a></escape>
+
 ### [0.5.2-alpha] - 2019-11-05
 
 #### Added(0.5.2-alpha)

diff --git a/autosub/__init__.py b/autosub/__init__.py
@@ -65,16 +65,13 @@ def main():  # pylint: disable=too-many-branches, too-many-statements, too-many-
         validate_result = cmdline_utils.validate_io(args, styles_list)
 
         if validate_result == 0:
-            ffmpeg_cmd = ffmpeg_utils.get_cmd("ffmpeg")
+            ffmpeg_cmd = "{} ".format(ffmpeg_utils.get_cmd("ffmpeg"))
             if not ffmpeg_cmd:
                 raise exceptions.AutosubException(
                     _("Error: Dependency ffmpeg"
                       " not found on this machine."))
 
-            ffmpeg_cmd = ffmpeg_cmd + ' '
-
-            cmdline_utils.fix_args(args,
-                                   ffmpeg_cmd=ffmpeg_cmd)
+            cmdline_utils.fix_args(args, ffmpeg_cmd=ffmpeg_cmd)
 
             if args.audio_process:
                 args.audio_process = {k.lower() for k in args.audio_process}

diff --git a/autosub/cmdline_utils.py b/autosub/cmdline_utils.py
@@ -11,6 +11,7 @@
 import os
 import subprocess
 import tempfile
+import gc
 
 # Import third-party modules
 import auditok
@@ -865,7 +866,7 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
             channel=1,
             sample_rate=48000,
             out_=audio_wav)
-        print(_("\nConvert source audio to \"{name}\" "
+        print(_("\nConvert source file to \"{name}\" "
                 "to get audio length and detect audio regions.").format(
                     name=audio_wav))
         print(command)
@@ -875,9 +876,11 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
 
         if not ffmpeg_utils.ffprobe_check_file(audio_wav):
             raise exceptions.AutosubException(
-                _("Error: Convert source audio to \"{name}\" failed.").format(
+                _("Error: Convert source file to \"{name}\" failed.").format(
                     name=audio_wav))
 
+        print(_("Detecting speech regions using Auditok."))
+
         regions = core.auditok_gen_speech_regions(
             audio_wav=audio_wav,
             energy_threshold=args.energy_threshold,
@@ -886,6 +889,8 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
             max_continuous_silence=args.max_continuous_silence,
             mode=mode)
         os.remove(audio_wav)
+        gc.collect(0)
+
         print(_("\n\"{name}\" has been deleted.").format(name=audio_wav))
 
     if not regions:
@@ -935,6 +940,7 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
             suffix=args.api_suffix,
             concurrency=args.audio_concurrency,
             is_keep=args.keep)
+        gc.collect(0)
 
         if not audio_fragments or \
                 len(audio_fragments) != len(regions):
@@ -981,6 +987,7 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
                 concurrency=args.speech_concurrency,
                 min_confidence=args.min_confidence,
                 is_keep=args.keep)
+            gc.collect(0)
 
         elif args.speech_api == "gcsv1":
             # Google Cloud speech-to-text V1P1Beta1
@@ -1034,6 +1041,7 @@ def audio_or_video_prcs(  # pylint: disable=too-many-branches, too-many-statemen
         else:
             text_list = None
 
+        gc.collect(0)
         if not text_list or len(text_list) != len(regions):
             raise exceptions.SpeechToTextException(
                 _("Error: Speech-to-text failed.\nAll works done."))

diff --git a/autosub/constants.py b/autosub/constants.py
@@ -9,6 +9,7 @@
 import sys
 import shlex
 import locale
+import multiprocessing
 
 # Import third-party modules
 
@@ -52,7 +53,12 @@
 GOOGLE_SPEECH_V2_API_KEY = "AIzaSyBOti4mM-6x9WDnZIjIeyEU21OpBXqWBgw"
 GOOGLE_SPEECH_V2_API_URL = \
     "www.google.com/speech-api/v2/recognize?client=chromium&lang={lang}&key={key}"
-DEFAULT_CONCURRENCY = 10
+
+if multiprocessing.cpu_count() > 3:
+    DEFAULT_CONCURRENCY = multiprocessing.cpu_count() >> 1
+else:
+    DEFAULT_CONCURRENCY = 2
+
 DEFAULT_SRC_LANGUAGE = 'en-US'
 DEFAULT_ENERGY_THRESHOLD = 45
 MAX_REGION_SIZE = 6.0

diff --git a/autosub/core.py b/autosub/core.py
@@ -9,6 +9,7 @@
 import multiprocessing
 import time
 import gettext
+import gc
 
 # Import third-party modules
 import progressbar
@@ -66,6 +67,7 @@ def auditok_gen_speech_regions(  # pylint: disable=too-many-arguments
     for token in tokens:
         # get start and end times
         regions.append((token[1] * 10, token[2] * 10))
+
     asource.close()
     # reference
     # auditok.readthedocs.io/en/latest/apitutorial.html#examples-using-real-audio-data
@@ -108,14 +110,16 @@ def bulk_audio_conversion(  # pylint: disable=too-many-arguments
         for i, flac_region in enumerate(pool.imap(converter, regions)):
             audio_fragments.append(flac_region)
             pbar.update(i)
+            gc.collect(0)
         pbar.finish()
+        pool.terminate()
+        pool.join()
 
     except KeyboardInterrupt:
         pbar.finish()
         pool.terminate()
         pool.join()
         return None
-
     return audio_fragments
 
 
@@ -155,8 +159,11 @@ def gsv2_to_text(  # pylint: disable=too-many-locals,too-many-arguments,too-many
                 text_list.append(transcript)
             else:
                 text_list.append("")
+            gc.collect(0)
             pbar.update(i)
         pbar.finish()
+        pool.terminate()
+        pool.join()
 
     except (KeyboardInterrupt, AttributeError) as error:
         pbar.finish()
@@ -274,6 +281,7 @@ def gcsv1_to_text(  # pylint: disable=too-many-locals,too-many-arguments,too-man
                 tasks.append(pool.apply_async(
                     speech_trans_api.gcsv1p1beta1_service_client,
                     args=(filename, is_keep, config, min_confidence)))
+                gc.collect(0)
 
             for task in tasks:
                 i = i + 1
@@ -285,6 +293,8 @@ def gcsv1_to_text(  # pylint: disable=too-many-locals,too-many-arguments,too-man
                 pbar.update(i)
 
         pbar.finish()
+        pool.terminate()
+        pool.join()
 
     except (KeyboardInterrupt, AttributeError) as error:
         pbar.finish()
@@ -508,7 +518,7 @@ def list_to_sub_str(
     else:
         # fallback process
         print(_("Format \"{fmt}\" not supported. "
-                "Using \"{default_fmt}\" instead.").format(
+                "Use \"{default_fmt}\" instead.").format(
                     fmt=subtitles_file_format,
                     default_fmt=constants.DEFAULT_SUBTITLES_FORMAT))
         pysubs2_obj = pysubs2.SSAFile()
@@ -567,7 +577,7 @@ def ssafile_to_sub_str(
     else:
         # fallback process
         print(_("Format \"{fmt}\" not supported. "
-                "Using \"{default_fmt}\" instead.").format(
+                "Use \"{default_fmt}\" instead.").format(
                     fmt=subtitles_file_format,
                     default_fmt=constants.DEFAULT_SUBTITLES_FORMAT))
         formatted_subtitles = ssafile.to_string(

diff --git a/autosub/data/locale/zh_CN/LC_MESSAGES/autosub.cmdline_utils.mo b/autosub/data/locale/zh_CN/LC_MESSAGES/autosub.cmdline_utils.mo