From eeed02cbb371c94d6609280bf47dc44be278b4b0 Mon Sep 17 00:00:00 2001
From: chiprodoy
Date: Wed, 14 Jun 2023 05:33:04 +0700
Subject: [PATCH] time format fixes in generated .srt files

---
 auto_subtitle/utils.py              |   4 +-
 build/lib/auto_subtitle/__init__.py |   0
 build/lib/auto_subtitle/cli.py      | 115 ++++++++++++++++++++++++++++
 build/lib/auto_subtitle/utils.py    |  46 +++++++++++
 4 files changed, 163 insertions(+), 2 deletions(-)
 create mode 100644 build/lib/auto_subtitle/__init__.py
 create mode 100644 build/lib/auto_subtitle/cli.py
 create mode 100644 build/lib/auto_subtitle/utils.py

diff --git a/auto_subtitle/utils.py b/auto_subtitle/utils.py
index ee5515b..fb4e11a 100644
--- a/auto_subtitle/utils.py
+++ b/auto_subtitle/utils.py
@@ -26,8 +26,8 @@ def format_timestamp(seconds: float, always_include_hours: bool = False):
     seconds = milliseconds // 1_000
     milliseconds -= seconds * 1_000
 
-    hours_marker = f"{hours}:" if always_include_hours or hours > 0 else ""
-    return f"{hours_marker}{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
 
 
 def write_srt(transcript: Iterator[dict], file: TextIO):
diff --git a/build/lib/auto_subtitle/__init__.py b/build/lib/auto_subtitle/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/build/lib/auto_subtitle/cli.py b/build/lib/auto_subtitle/cli.py
new file mode 100644
index 0000000..21cdc16
--- /dev/null
+++ b/build/lib/auto_subtitle/cli.py
@@ -0,0 +1,115 @@
+import os
+import ffmpeg
+import whisper
+import argparse
+import warnings
+import tempfile
+from .utils import filename, str2bool, write_srt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument("video", nargs="+", type=str,
+                        help="paths to video files to transcribe")
+    parser.add_argument("--model", default="small",
+                        choices=whisper.available_models(), help="name of the Whisper model to use")
+    parser.add_argument("--output_dir", "-o", type=str,
+                        default=".", help="directory to save the outputs")
+    parser.add_argument("--output_srt", type=str2bool, default=False,
+                        help="whether to output the .srt file along with the video files")
+    parser.add_argument("--srt_only", type=str2bool, default=False,
+                        help="only generate the .srt file and not create overlayed video")
+    parser.add_argument("--verbose", type=str2bool, default=False,
+                        help="whether to print out the progress and debug messages")
+
+    parser.add_argument("--task", type=str, default="transcribe", choices=[
+                        "transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
+    parser.add_argument("--language", type=str, default="auto", choices=["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","zh"],
+                        help="What is the origin language of the video? If unset, it is detected automatically.")
+
+    args = parser.parse_args().__dict__
+    model_name: str = args.pop("model")
+    output_dir: str = args.pop("output_dir")
+    output_srt: bool = args.pop("output_srt")
+    srt_only: bool = args.pop("srt_only")
+    language: str = args.pop("language")
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    if model_name.endswith(".en"):
+        warnings.warn(
+            f"{model_name} is an English-only model, forcing English detection.")
+        args["language"] = "en"
+    # if translate task used and language argument is set, then use it
+    elif language != "auto":
+        args["language"] = language
+
+    model = whisper.load_model(model_name)
+    audios = get_audio(args.pop("video"))
+    subtitles = get_subtitles(
+        audios, output_srt or srt_only, output_dir, lambda audio_path: model.transcribe(audio_path, **args)
+    )
+
+    if srt_only:
+        return
+
+    for path, srt_path in subtitles.items():
+        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
+
+        print(f"Adding subtitles to {filename(path)}...")
+
+        video = ffmpeg.input(path)
+        audio = video.audio
+
+        ffmpeg.concat(
+            video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
+        ).output(out_path).run(quiet=True, overwrite_output=True)
+
+        print(f"Saved subtitled video to {os.path.abspath(out_path)}.")
+
+
+def get_audio(paths):
+    temp_dir = tempfile.gettempdir()
+
+    audio_paths = {}
+
+    for path in paths:
+        print(f"Extracting audio from {filename(path)}...")
+        output_path = os.path.join(temp_dir, f"{filename(path)}.wav")
+
+        ffmpeg.input(path).output(
+            output_path,
+            acodec="pcm_s16le", ac=1, ar="16k"
+        ).run(quiet=True, overwrite_output=True)
+
+        audio_paths[path] = output_path
+
+    return audio_paths
+
+
+def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, transcribe: callable):
+    subtitles_path = {}
+
+    for path, audio_path in audio_paths.items():
+        srt_path = output_dir if output_srt else tempfile.gettempdir()
+        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
+
+        print(
+            f"Generating subtitles for {filename(path)}... This might take a while."
+        )
+
+        warnings.filterwarnings("ignore")
+        result = transcribe(audio_path)
+        warnings.filterwarnings("default")
+
+        with open(srt_path, "w", encoding="utf-8") as srt:
+            write_srt(result["segments"], file=srt)
+
+        subtitles_path[path] = srt_path
+
+    return subtitles_path
+
+
+if __name__ == '__main__':
+    main()
diff --git a/build/lib/auto_subtitle/utils.py b/build/lib/auto_subtitle/utils.py
new file mode 100644
index 0000000..fb4e11a
--- /dev/null
+++ b/build/lib/auto_subtitle/utils.py
@@ -0,0 +1,46 @@
+import os
+from typing import Iterator, TextIO
+
+
+def str2bool(string):
+    string = string.lower()
+    str2val = {"true": True, "false": False}
+
+    if string in str2val:
+        return str2val[string]
+    else:
+        raise ValueError(
+            f"Expected one of {set(str2val.keys())}, got {string}")
+
+
+def format_timestamp(seconds: float, always_include_hours: bool = False):
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
+
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
+
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
+
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
+
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
+
+def write_srt(transcript: Iterator[dict], file: TextIO):
+    for i, segment in enumerate(transcript, start=1):
+        print(
+            f"{i}\n"
+            f"{format_timestamp(segment['start'], always_include_hours=True)} --> "
+            f"{format_timestamp(segment['end'], always_include_hours=True)}\n"
+            f"{segment['text'].strip().replace('-->', '->')}\n",
+            file=file,
+            flush=True,
+        )
+
+
+def filename(path):
+    return os.path.splitext(os.path.basename(path))[0]
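
A quick sanity check of the patched format_timestamp for reviewers; a minimal sketch, assuming the package is importable from the repo root. The expected strings follow from the arithmetic in the function above, and the "before" value is what the pre-patch f-strings produced.

    # Minimal check of the patched format_timestamp (run from the repo root).
    from auto_subtitle.utils import format_timestamp

    # SRT timestamps are HH:MM:SS,mmm: a comma before the milliseconds and a
    # zero-padded hour field whenever hours are shown.
    assert format_timestamp(3661.005, always_include_hours=True) == "01:01:01,005"
    # Before this patch the same call returned "1:01:01.005" (unpadded hour,
    # dot separator), which strict SRT parsers can reject.
    assert format_timestamp(59.2) == "00:59,200"  # hour field still omitted when zero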
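
The change reaches every cue that write_srt emits, since both boundaries are formatted with always_include_hours=True. A small illustration with hypothetical segment values, writing to stdout instead of an .srt file:

    import sys
    from auto_subtitle.utils import write_srt

    # Hypothetical Whisper-style segments; write_srt only reads start, end and text.
    segments = [
        {"start": 0.0, "end": 2.5, "text": " Hello world."},
        {"start": 2.5, "end": 7.25, "text": " Timestamps now use comma-separated milliseconds."},
    ]
    write_srt(segments, file=sys.stdout)

    # Expected output (each print call ends with "\n\n", giving the blank line
    # that separates SRT cues):
    # 1
    # 00:00:00,000 --> 00:00:02,500
    # Hello world.
    #
    # 2
    # 00:00:02,500 --> 00:00:07,250
    # Timestamps now use comma-separated milliseconds.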