diff --git a/.gitignore b/.gitignore
index e7e375e..1c79a48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
 dist
 .DS_Store
 *.egg-info
-auto_subtitle/__pycache__
 build
+__pycache__
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index cb275c7..3e586f4 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,7 @@
 MIT License
 
-Copyright (c) 2022 Miguel Piedrafita
+Copyright (c) 2022-2024 Miguel Piedrafita
+Copyright (c) 2024 Sergey Chernyaev
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 1d21530..8373264 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
 # Automatic subtitles in your videos
 
-This repository uses `ffmpeg` and [OpenAI's Whisper](https://openai.com/blog/whisper) to automatically generate and overlay subtitles on any video.
+This repository uses `ffmpeg` and [OpenAI's Whisper](https://openai.com/blog/whisper) ([faster-whisper](https://github.com/SYSTRAN/faster-whisper) implementation) to automatically generate and overlay subtitles on any video.
 
 ## Installation
 
 To get started, you'll need Python 3.7 or newer. Install the binary by running the following command:
 
-    pip install git+https://github.com/m1guelpf/auto-subtitle.git
+    pip install git+https://github.com/Sirozha1337/faster-auto-subtitle.git@dev
 
 You'll also need to install [`ffmpeg`](https://ffmpeg.org/), which is available from most package managers:
 
@@ -25,19 +25,32 @@ choco install ffmpeg
 
 The following command will generate a `subtitled/video.mp4` file containing the input video with overlaid subtitles.
 
-    auto_subtitle /path/to/video.mp4 -o subtitled/
+    faster_auto_subtitle /path/to/video.mp4 -o subtitled/
 
-The default setting (which selects the `small` model) works well for transcribing English. You can optionally use a bigger model for better results (especially with other languages). The available models are `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`.
-
-    auto_subtitle /path/to/video.mp4 --model medium
+The default setting (which selects the `small` model) works well for transcribing English. You can optionally use a bigger model for better results (especially with other languages). The available models are `tiny`, `tiny.en`, `base`, `base.en`, `small`, `small.en`, `medium`, `medium.en`, `large`, `large-v1`, `large-v2`, `large-v3`.
+
+    faster_auto_subtitle /path/to/video.mp4 --model medium
 
 Adding `--task translate` will translate the subtitles into English:
 
-    auto_subtitle /path/to/video.mp4 --task translate
+    faster_auto_subtitle /path/to/video.mp4 --task translate
 
 Run the following to view all available options:
 
-    auto_subtitle --help
+    faster_auto_subtitle --help
+
+## Tips
+
+The tool also exposes a few model parameters that you can tweak to increase accuracy.
+
+A higher `beam_size` usually leads to greater accuracy, but slows down the process.
+
+Setting a higher `no_speech_threshold` can be useful for videos with a lot of background noise, to stop Whisper from "hallucinating" subtitles for it.
+
+In my experience, setting `condition_on_previous_text` to `False` dramatically increases accuracy for videos like TV shows with an intro song at the start.
+
+You can use the `sample_interval` parameter to generate subtitles for a portion of the video to play around with those parameters:
+
+    faster_auto_subtitle /path/to/video.mp4 --model medium --sample_interval 00:05:30-00:07:00 --condition_on_previous_text False --beam_size 6 --no_speech_threshold 0.7
 
 ## License
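The three tips above map directly onto faster-whisper's `transcribe()` parameters. A minimal sketch of the equivalent direct call, assuming a local `sample.wav` (a placeholder file name) and CPU inference so the example runs without a GPU; the packaged `WhisperAI` wrapper further down uses `device="cuda"` instead:

```python
# Sketch: what the tips flags amount to when calling faster-whisper directly.
from faster_whisper import WhisperModel

model = WhisperModel("medium", device="cpu", compute_type="int8")
segments, info = model.transcribe(
    "sample.wav",                      # placeholder audio file
    beam_size=6,                       # wider beam search: slower, usually more accurate
    no_speech_threshold=0.7,           # drop segments that are probably just noise
    condition_on_previous_text=False,  # helps when intros/songs derail the decoding context
)
for segment in segments:  # lazy generator; decoding happens as you iterate
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```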
diff --git a/auto_subtitle/cli.py b/auto_subtitle/cli.py
index 21cdc16..2e0eac8 100644
--- a/auto_subtitle/cli.py
+++ b/auto_subtitle/cli.py
@@ -1,114 +1,39 @@
-import os
-import ffmpeg
-import whisper
 import argparse
-import warnings
-import tempfile
-from .utils import filename, str2bool, write_srt
+from faster_whisper import available_models
+from .main import process
+from .utils.convert import str2bool, str2timeinterval
 
 
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("video", nargs="+", type=str,
                         help="paths to video files to transcribe")
+    parser.add_argument("--audio_channel", default="0",
+                        type=int, help="audio channel index to use")
+    parser.add_argument("--sample_interval", type=str2timeinterval, default=None,
+                        help="generate subtitles for a specific fragment of the video (e.g. 01:02:05-01:03:45)")
     parser.add_argument("--model", default="small",
-                        choices=whisper.available_models(), help="name of the Whisper model to use")
+                        choices=available_models(), help="name of the Whisper model to use")
     parser.add_argument("--output_dir", "-o", type=str,
                         default=".", help="directory to save the outputs")
     parser.add_argument("--output_srt", type=str2bool, default=False,
                         help="whether to output the .srt file along with the video files")
     parser.add_argument("--srt_only", type=str2bool, default=False,
                         help="only generate the .srt file and not create overlayed video")
-    parser.add_argument("--verbose", type=str2bool, default=False,
-                        help="whether to print out the progress and debug messages")
-
+    parser.add_argument("--beam_size", type=int, default=5,
+                        help="model parameter, tweak to increase accuracy")
+    parser.add_argument("--no_speech_threshold", type=float, default=0.6,
+                        help="model parameter, tweak to increase accuracy")
+    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True,
+                        help="model parameter, tweak to increase accuracy")
     parser.add_argument("--task", type=str, default="transcribe",
                         choices=["transcribe", "translate"],
                         help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
     parser.add_argument("--language", type=str, default="auto",
                         choices=["auto","af","am","ar","as","az","ba","be","bg","bn","bo","br","bs","ca","cs","cy","da","de","el","en","es","et","eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi","hr","ht","hu","hy","id","is","it","ja","jw","ka","kk","km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi","mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no","oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk","sl","sn","so","sq","sr","su","sv","sw","ta","te","tg","th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo","zh"],
                         help="What is the origin language of the video? If unset, it is detected automatically.")
 
     args = parser.parse_args().__dict__
-    model_name: str = args.pop("model")
-    output_dir: str = args.pop("output_dir")
-    output_srt: bool = args.pop("output_srt")
-    srt_only: bool = args.pop("srt_only")
-    language: str = args.pop("language")
-
-    os.makedirs(output_dir, exist_ok=True)
-    if model_name.endswith(".en"):
-        warnings.warn(
-            f"{model_name} is an English-only model, forcing English detection.")
-        args["language"] = "en"
-    # if translate task used and language argument is set, then use it
-    elif language != "auto":
-        args["language"] = language
-
-    model = whisper.load_model(model_name)
-    audios = get_audio(args.pop("video"))
-    subtitles = get_subtitles(
-        audios, output_srt or srt_only, output_dir, lambda audio_path: model.transcribe(audio_path, **args)
-    )
-
-    if srt_only:
-        return
-
-    for path, srt_path in subtitles.items():
-        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
-
-        print(f"Adding subtitles to {filename(path)}...")
-
-        video = ffmpeg.input(path)
-        audio = video.audio
-
-        ffmpeg.concat(
-            video.filter('subtitles', srt_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
-        ).output(out_path).run(quiet=True, overwrite_output=True)
-
-        print(f"Saved subtitled video to {os.path.abspath(out_path)}.")
-
-
-def get_audio(paths):
-    temp_dir = tempfile.gettempdir()
-
-    audio_paths = {}
-
-    for path in paths:
-        print(f"Extracting audio from {filename(path)}...")
-        output_path = os.path.join(temp_dir, f"{filename(path)}.wav")
-
-        ffmpeg.input(path).output(
-            output_path,
-            acodec="pcm_s16le", ac=1, ar="16k"
-        ).run(quiet=True, overwrite_output=True)
-
-        audio_paths[path] = output_path
-
-    return audio_paths
-
-
-def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, transcribe: callable):
-    subtitles_path = {}
-
-    for path, audio_path in audio_paths.items():
-        srt_path = output_dir if output_srt else tempfile.gettempdir()
-        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
-
-        print(
-            f"Generating subtitles for {filename(path)}... This might take a while."
-        )
-
-        warnings.filterwarnings("ignore")
-        result = transcribe(audio_path)
-        warnings.filterwarnings("default")
-
-        with open(srt_path, "w", encoding="utf-8") as srt:
-            write_srt(result["segments"], file=srt)
-
-        subtitles_path[path] = srt_path
-
-    return subtitles_path
+    process(args)
 
 
 if __name__ == '__main__':
     main()
diff --git a/auto_subtitle/main.py b/auto_subtitle/main.py
new file mode 100644
index 0000000..c9a14fb
--- /dev/null
+++ b/auto_subtitle/main.py
@@ -0,0 +1,55 @@
+import os
+import warnings
+import tempfile
+from .utils.files import filename, write_srt
+from .utils.ffmpeg import get_audio, overlay_subtitles
+from .utils.whisper import WhisperAI
+
+def process(args: dict):
+    model_name: str = args.pop("model")
+    output_dir: str = args.pop("output_dir")
+    output_srt: bool = args.pop("output_srt")
+    srt_only: bool = args.pop("srt_only")
+    language: str = args.pop("language")
+    sample_interval: str = args.pop("sample_interval")
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    if model_name.endswith(".en"):
+        warnings.warn(
+            f"{model_name} is an English-only model, forcing English detection.")
+        args["language"] = "en"
+    # if translate task used and language argument is set, then use it
+    elif language != "auto":
+        args["language"] = language
+
+    audios = get_audio(args.pop("video"), args.pop('audio_channel'), sample_interval)
+    subtitles = get_subtitles(
+        audios, output_srt or srt_only, output_dir, model_name, args
+    )
+
+    if srt_only:
+        return
+
+    overlay_subtitles(subtitles, output_dir, sample_interval)
+
+def get_subtitles(audio_paths: list, output_srt: bool, output_dir: str, model_name: str, model_args: dict):
+    model = WhisperAI(model_name, model_args)
+
+    subtitles_path = {}
+
+    for path, audio_path in audio_paths.items():
+        print(
+            f"Generating subtitles for {filename(path)}... This might take a while."
+        )
+        srt_path = output_dir if output_srt else tempfile.gettempdir()
+        srt_path = os.path.join(srt_path, f"{filename(path)}.srt")
+
+        segments = model.transcribe(audio_path)
+
+        with open(srt_path, "w", encoding="utf-8") as srt:
+            write_srt(segments, file=srt)
+
+        subtitles_path[path] = srt_path
+
+    return subtitles_path
\ No newline at end of file
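Since `cli.py` now only builds an argument dictionary and hands it to `process()`, the pipeline can also be driven programmatically. A sketch under the assumption that the package is installed; `video.mp4` is a placeholder path, and the keys mirror the argparse defaults above:

```python
# Sketch: invoking the refactored pipeline without argparse. Anything that
# process() does not pop is forwarded to faster-whisper's transcribe().
from auto_subtitle.main import process

args = {
    "video": ["video.mp4"],        # placeholder input path
    "audio_channel": 0,
    "sample_interval": None,       # or [330, 420] for 00:05:30-00:07:00
    "model": "small",
    "output_dir": "subtitled",
    "output_srt": True,
    "srt_only": True,              # stop after writing the .srt
    "language": "auto",
    "task": "transcribe",
    "beam_size": 5,
    "no_speech_threshold": 0.6,
    "condition_on_previous_text": True,
}
process(args)  # writes subtitled/video.srt
```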
diff --git a/auto_subtitle/utils.py b/auto_subtitle/utils.py
deleted file mode 100644
index fb4e11a..0000000
--- a/auto_subtitle/utils.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import os
-from typing import Iterator, TextIO
-
-
-def str2bool(string):
-    string = string.lower()
-    str2val = {"true": True, "false": False}
-
-    if string in str2val:
-        return str2val[string]
-    else:
-        raise ValueError(
-            f"Expected one of {set(str2val.keys())}, got {string}")
-
-
-def format_timestamp(seconds: float, always_include_hours: bool = False):
-    assert seconds >= 0, "non-negative timestamp expected"
-    milliseconds = round(seconds * 1000.0)
-
-    hours = milliseconds // 3_600_000
-    milliseconds -= hours * 3_600_000
-
-    minutes = milliseconds // 60_000
-    milliseconds -= minutes * 60_000
-
-    seconds = milliseconds // 1_000
-    milliseconds -= seconds * 1_000
-
-    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
-    return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
-
-
-def write_srt(transcript: Iterator[dict], file: TextIO):
-    for i, segment in enumerate(transcript, start=1):
-        print(
-            f"{i}\n"
-            f"{format_timestamp(segment['start'], always_include_hours=True)} --> "
-            f"{format_timestamp(segment['end'], always_include_hours=True)}\n"
-            f"{segment['text'].strip().replace('-->', '->')}\n",
-            file=file,
-            flush=True,
-        )
-
-
-def filename(path):
-    return os.path.splitext(os.path.basename(path))[0]
diff --git a/auto_subtitle/utils/__init__.py b/auto_subtitle/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/auto_subtitle/utils/convert.py b/auto_subtitle/utils/convert.py
new file mode 100644
index 0000000..7d14df5
--- /dev/null
+++ b/auto_subtitle/utils/convert.py
@@ -0,0 +1,82 @@
+from datetime import datetime, timedelta
+
+def str2bool(string):
+    string = string.lower()
+    str2val = {"true": True, "false": False}
+
+    if string in str2val:
+        return str2val[string]
+    else:
+        raise ValueError(
+            f"Expected one of {set(str2val.keys())}, got {string}")
+
+def str2timeinterval(string):
+    if string is None:
+        return None
+
+    if '-' not in string:
+        raise ValueError(
+            f"Expected time interval HH:mm:ss-HH:mm:ss or HH:mm-HH:mm or ss-ss, got {string}")
+
+    intervals = string.split('-')
+    if len(intervals) != 2:
+        raise ValueError(
+            f"Expected time interval HH:mm:ss-HH:mm:ss or HH:mm-HH:mm or ss-ss, got {string}")
+
+    start = try_parse_timestamp(intervals[0])
+    end = try_parse_timestamp(intervals[1])
+    if start >= end:
+        raise ValueError(
+            f"Expected time interval end to be higher than start, got {start} >= {end}")
+
+    return [start, end]
+
+def time_to_timestamp(string):
+    split_time = string.split(':')
+    if len(split_time) == 0 or len(split_time) > 3 or not all([ x.isdigit() for x in split_time ]):
+        raise ValueError(
+            f"Expected HH:mm:ss or HH:mm or ss, got {string}")
+
+    if len(split_time) == 1:
+        return int(split_time[0])
+
+    if len(split_time) == 2:
+        return int(split_time[0]) * 60 * 60 + int(split_time[1]) * 60
+
+    return int(split_time[0]) * 60 * 60 + int(split_time[1]) * 60 + int(split_time[2])
+
+def try_parse_timestamp(string):
+    timestamp = parse_timestamp(string, '%H:%M:%S')
+    if timestamp is not None:
+        return timestamp
+
+    timestamp = parse_timestamp(string, '%H:%M')
+    if timestamp is not None:
+        return timestamp
+
+    return parse_timestamp(string, '%S')
+
+def parse_timestamp(string, pattern):
+    try:
+        date = datetime.strptime(string, pattern)
+        delta = timedelta(hours=date.hour, minutes=date.minute, seconds=date.second)
+        return int(delta.total_seconds())
+    except ValueError:
+        return None
+
+def format_timestamp(seconds: float, always_include_hours: bool = False):
+    assert seconds >= 0, "non-negative timestamp expected"
+    milliseconds = round(seconds * 1000.0)
+
+    hours = milliseconds // 3_600_000
+    milliseconds -= hours * 3_600_000
+
+    minutes = milliseconds // 60_000
+    milliseconds -= minutes * 60_000
+
+    seconds = milliseconds // 1_000
+    milliseconds -= seconds * 1_000
+
+    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
+    return f"{hours_marker}{minutes:02d}:{seconds:02d},{milliseconds:03d}"
+
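A quick illustration of what the interval helpers above produce; the calls are examples only, with values that follow from the code:

```python
from auto_subtitle.utils.convert import str2timeinterval, format_timestamp

# "HH:mm:ss-HH:mm:ss" becomes a pair of offsets in whole seconds.
print(str2timeinterval("00:05:30-00:07:00"))  # [330, 420]

# format_timestamp goes the other way, emitting SRT-style "mm:ss,mmm"
# (hours are prepended past the hour mark, or when forced).
print(format_timestamp(330.0))                             # 05:30,000
print(format_timestamp(330.5, always_include_hours=True))  # 00:05:30,500
```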
diff --git a/auto_subtitle/utils/ffmpeg.py b/auto_subtitle/utils/ffmpeg.py
new file mode 100644
index 0000000..0ea7f43
--- /dev/null
+++ b/auto_subtitle/utils/ffmpeg.py
@@ -0,0 +1,65 @@
+import os
+import ffmpeg
+import tempfile
+from .mytempfile import MyTempFile
+from .files import filename
+
+def get_audio(paths: list, audio_channel_index: int, sample_interval: list):
+    temp_dir = tempfile.gettempdir()
+
+    audio_paths = {}
+
+    for path in paths:
+        print(f"Extracting audio from {filename(path)}...")
+        output_path = os.path.join(temp_dir, f"{filename(path)}.wav")
+
+        ffmpeg_input_args = dict()
+        if sample_interval is not None:
+            ffmpeg_input_args['ss'] = str(sample_interval[0])
+
+        ffmpeg_output_args = dict()
+        ffmpeg_output_args['acodec'] = "pcm_s16le"
+        ffmpeg_output_args['ac'] = "1"
+        ffmpeg_output_args['ar'] = "16k"
+        ffmpeg_output_args['map'] = "0:a:" + str(audio_channel_index)
+        if sample_interval is not None:
+            ffmpeg_output_args['t'] = str(sample_interval[1] - sample_interval[0])
+
+        ffmpeg.input(path, **ffmpeg_input_args).output(
+            output_path,
+            **ffmpeg_output_args
+        ).run(quiet=True, overwrite_output=True)
+
+        audio_paths[path] = output_path
+
+    return audio_paths
+
+def escape_windows_path(path: str):
+    return path.replace("\\", "/").replace(":", "\\:").replace(" ", "\\ ").replace("(", "\\(").replace(")", "\\)").replace("[", "\\[").replace("]", "\\]").replace("'", "'\\''")
+
+
+def overlay_subtitles(subtitles: dict, output_dir: str, sample_interval: list):
+    for path, srt_path in subtitles.items():
+        out_path = os.path.join(output_dir, f"{filename(path)}.mp4")
+
+        print(f"Adding subtitles to {filename(path)}...")
+
+        ffmpeg_input_args = dict()
+        if sample_interval is not None:
+            ffmpeg_input_args['ss'] = str(sample_interval[0])
+
+        ffmpeg_output_args = dict()
+        if sample_interval is not None:
+            ffmpeg_output_args['t'] = str(sample_interval[1] - sample_interval[0])
+
+        # HACK: On Windows it's impossible to use an absolute subtitle file path with ffmpeg,
+        # so we use a temp copy instead.
+        # see: https://github.com/kkroening/ffmpeg-python/issues/745
+        with MyTempFile(srt_path) as srt_temp:
+            video = ffmpeg.input(path, **ffmpeg_input_args)
+            audio = video.audio
+
+            ffmpeg.concat(
+                video.filter('subtitles', srt_temp.tmp_file_path, force_style="OutlineColour=&H40000000,BorderStyle=3"), audio, v=1, a=1
+            ).output(out_path, **ffmpeg_output_args).run(quiet=True, overwrite_output=True)
+
+        print(f"Saved subtitled video to {os.path.abspath(out_path)}.")
\ No newline at end of file
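`get_audio` keys its result by source path, so later stages can match each video to its extracted audio. A small usage sketch with placeholder paths, assuming `ffmpeg` is on the PATH, just as the CLI requires:

```python
from auto_subtitle.utils.ffmpeg import get_audio

# Extract 90 seconds (00:05:30-00:07:00) from the first audio stream.
# "video.mp4" is a placeholder; the .wav lands in the system temp dir.
audio_paths = get_audio(["video.mp4"], audio_channel_index=0, sample_interval=[330, 420])
print(audio_paths)  # e.g. {'video.mp4': '/tmp/video.wav'} (tempdir varies by OS)
```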
diff --git a/auto_subtitle/utils/files.py b/auto_subtitle/utils/files.py
new file mode 100644
index 0000000..5caaead
--- /dev/null
+++ b/auto_subtitle/utils/files.py
@@ -0,0 +1,17 @@
+import os
+from typing import Iterator, TextIO
+from .convert import format_timestamp
+
+def write_srt(transcript: Iterator, file: TextIO):
+    for i, segment in enumerate(transcript, start=1):
+        print(
+            f"{i}\n"
+            f"{format_timestamp(segment.start, always_include_hours=True)} --> "
+            f"{format_timestamp(segment.end, always_include_hours=True)}\n"
+            f"{segment.text.strip().replace('-->', '->')}\n",
+            file=file,
+            flush=True,
+        )
+
+def filename(path):
+    return os.path.splitext(os.path.basename(path))[0]
diff --git a/auto_subtitle/utils/mytempfile.py b/auto_subtitle/utils/mytempfile.py
new file mode 100644
index 0000000..e1dc0cf
--- /dev/null
+++ b/auto_subtitle/utils/mytempfile.py
@@ -0,0 +1,18 @@
+import tempfile
+import os
+import shutil
+
+class MyTempFile:
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def __enter__(self):
+        self.tmp_file = tempfile.NamedTemporaryFile('w', dir='.', delete=False)
+        self.tmp_file_path = os.path.relpath(self.tmp_file.name, '.')
+        shutil.copyfile(self.file_path, self.tmp_file_path)
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_traceback):
+        self.tmp_file.close()
+        if os.path.isfile(self.tmp_file_path):
+            os.remove(self.tmp_file_path)
diff --git a/auto_subtitle/utils/whisper.py b/auto_subtitle/utils/whisper.py
new file mode 100644
index 0000000..a4984e1
--- /dev/null
+++ b/auto_subtitle/utils/whisper.py
@@ -0,0 +1,20 @@
+import warnings
+import faster_whisper
+from tqdm import tqdm
+
+class WhisperAI:
+    def __init__(self, model_name, model_args):
+        self.model = faster_whisper.WhisperModel(model_name, device="cuda", compute_type="float16")
+        self.model_args = model_args
+
+    def transcribe(self, audio_path):
+        warnings.filterwarnings("ignore")
+        segments, info = self.model.transcribe(audio_path, **self.model_args)
+        warnings.filterwarnings("default")
+
+        total_duration = round(info.duration, 2)  # Same precision as the Whisper timestamps.
+
+        with tqdm(total=total_duration, unit=" seconds") as pbar:
+            for segment in segments:
+                yield segment
+                pbar.update(segment.end - segment.start)
diff --git a/requirements.txt b/requirements.txt
index 73bca28..eab95da 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
-openai-whisper
+faster-whisper==0.10.0
+tqdm==4.56.0
+ffmpeg-python==0.2.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index ca2ed5b..c185e54 100644
--- a/setup.py
+++ b/setup.py
@@ -2,16 +2,18 @@ from setuptools import setup, find_packages
 
 setup(
     version="1.0",
-    name="auto_subtitle",
+    name="faster_auto_subtitle",
     packages=find_packages(),
     py_modules=["auto_subtitle"],
-    author="Miguel Piedrafita",
+    author="Sergey Chernyaev",
     install_requires=[
-        'openai-whisper',
+        'faster-whisper',
+        'tqdm',
+        'ffmpeg-python'
     ],
     description="Automatically generate and embed subtitles into your videos",
     entry_points={
-        'console_scripts': ['auto_subtitle=auto_subtitle.cli:main'],
+        'console_scripts': ['faster_auto_subtitle=auto_subtitle.cli:main'],
     },
     include_package_data=True,
 )
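Taken together, the new `WhisperAI` wrapper streams segments lazily while reporting progress via tqdm. A usage sketch with a placeholder `audio.wav`; note the constructor hardcodes `device="cuda"`, so this assumes a CUDA-capable GPU with float16 support:

```python
from auto_subtitle.utils.whisper import WhisperAI

# model_args are forwarded verbatim to faster-whisper's transcribe().
model = WhisperAI("small", {"task": "transcribe", "beam_size": 5})

# transcribe() is a generator: segments stream out as they are decoded,
# and the progress bar advances by each segment's duration.
for segment in model.transcribe("audio.wav"):  # placeholder .wav path
    print(segment.start, segment.end, segment.text)
```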