diff --git a/.vscode/launch.json b/.vscode/launch.json index ccbc084..f9576a1 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -6,19 +6,31 @@ "configurations": [ { "name": "Python Debugger: Current File", - "type": "debugpy", + "type": "python", "request": "launch", "program": "${file}", "console": "integratedTerminal", "justMyCode": false, "env": { - "CUDA_VISIBLE_DEVICES": "1" + "CUDA_VISIBLE_DEVICES": "1", + "LD_LIBRARY_PATH": "/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cudnn/lib" }, "args": [ "--model", - "base", - "--show", - "Gary Neville's Soccerbox" + "base" + ] + }, + { + "name": "Current (withenv)", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/run_with_env.sh", + "console": "integratedTerminal", + "justMyCode": false, + "args": [ + "${file}", + "--model", + "base" ] } ] diff --git a/bazarr-ai-sub-generator/cli.py b/bazarr-ai-sub-generator/cli.py index ad22bf6..f788796 100644 --- a/bazarr-ai-sub-generator/cli.py +++ b/bazarr-ai-sub-generator/cli.py @@ -55,24 +55,6 @@ def main(): # help="Type to use for computation. \ # See https://opennmt.net/CTranslate2/quantization.html.", # ) - # parser.add_argument( - # "--beam_size", - # type=int, - # default=5, - # help="model parameter, tweak to increase accuracy", - # ) - # parser.add_argument( - # "--no_speech_threshold", - # type=float, - # default=0.6, - # help="model parameter, tweak to increase accuracy", - # ) - # parser.add_argument( - # "--condition_on_previous_text", - # type=str2bool, - # default=True, - # help="model parameter, tweak to increase accuracy", - # ) parser.add_argument( "--show", type=str, diff --git a/bazarr-ai-sub-generator/main.py b/bazarr-ai-sub-generator/main.py index 76d009b..c38fa16 100644 --- a/bazarr-ai-sub-generator/main.py +++ b/bazarr-ai-sub-generator/main.py @@ -6,6 +6,7 @@ from utils.files import filename, write_srt from utils.ffmpeg import get_audio, add_subtitles_to_mp4 from utils.bazarr import get_wanted_episodes, get_episode_details, sync_series from utils.sonarr import update_show_in_sonarr +# from utils.faster_whisper import WhisperAI from utils.whisper import WhisperAI from utils.decorator import measure_time @@ -34,13 +35,16 @@ def process(args: dict): for episode in list_of_episodes_needing_subtitles["data"]: print(f"Processing {episode['seriesTitle']} - {episode['episode_number']}") episode_data = get_episode_details(episode["sonarrEpisodeId"]) - audios = get_audio([episode_data["path"]], 0, None) - subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args) + try: + audios = get_audio([episode_data["path"]], 0, None) + subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args) - add_subtitles_to_mp4(subtitles) - update_show_in_sonarr(episode["sonarrSeriesId"]) - time.sleep(5) - sync_series() + add_subtitles_to_mp4(subtitles) + update_show_in_sonarr(episode["sonarrSeriesId"]) + time.sleep(5) + sync_series() + except Exception as ex: + print(f"skipping file due to - {ex}") @measure_time def get_subtitles( diff --git a/bazarr-ai-sub-generator/utils/faster_whisper.py b/bazarr-ai-sub-generator/utils/faster_whisper.py new file mode 100644 index 0000000..a9700a8 --- /dev/null +++ b/bazarr-ai-sub-generator/utils/faster_whisper.py @@ -0,0 +1,68 @@ +import warnings +import faster_whisper +from tqdm import tqdm + + +# pylint: disable=R0903 +class WhisperAI: + """ + Wrapper class for the Whisper speech recognition model with additional functionality. + + This class provides a high-level interface for transcribing audio files using the Whisper + speech recognition model. It encapsulates the model instantiation and transcription process, + allowing users to easily transcribe audio files and iterate over the resulting segments. + + Usage: + ```python + whisper = WhisperAI(model_args, transcribe_args) + + # Transcribe an audio file and iterate over the segments + for segment in whisper.transcribe(audio_path): + # Process each transcription segment + print(segment) + ``` + + Args: + - model_args: Arguments to pass to WhisperModel initialize method + - model_size_or_path (str): The name of the Whisper model to use. + - device (str): The device to use for computation ("cpu", "cuda", "auto"). + - compute_type (str): The type to use for computation. + See https://opennmt.net/CTranslate2/quantization.html. + - transcribe_args (dict): Additional arguments to pass to the transcribe method. + + Attributes: + - model (faster_whisper.WhisperModel): The underlying Whisper speech recognition model. + - transcribe_args (dict): Additional arguments used for transcribe method. + + Methods: + - transcribe(audio_path): Transcribes an audio file and yields the resulting segments. + """ + + def __init__(self, model_args: dict, transcribe_args: dict): + # self.model = faster_whisper.WhisperModel(**model_args) + model_size = "base" + self.model = faster_whisper.WhisperModel(model_size, device="cuda") + self.transcribe_args = transcribe_args + + def transcribe(self, audio_path: str): + """ + Transcribes the specified audio file and yields the resulting segments. + + Args: + - audio_path (str): The path to the audio file for transcription. + + Yields: + - faster_whisper.TranscriptionSegment: An individual transcription segment. + """ + warnings.filterwarnings("ignore") + segments, info = self.model.transcribe(audio_path, beam_size=5) + warnings.filterwarnings("default") + + # Same precision as the Whisper timestamps. + total_duration = round(info.duration, 2) + + with tqdm(total=total_duration, unit=" seconds") as pbar: + for segment in segments: + yield segment + pbar.update(segment.end - segment.start) + pbar.update(0) \ No newline at end of file diff --git a/bazarr-ai-sub-generator/utils/whisper.py b/bazarr-ai-sub-generator/utils/whisper.py index 6db019c..3f2fc9f 100644 --- a/bazarr-ai-sub-generator/utils/whisper.py +++ b/bazarr-ai-sub-generator/utils/whisper.py @@ -50,7 +50,7 @@ class WhisperAI: # Set device for computation self.device = torch.device(device) # Load the Whisper model with the specified size - self.model = whisper.load_model("base").to(self.device) + self.model = whisper.load_model("base.en").to(self.device) # Store the additional transcription arguments self.transcribe_args = transcribe_args diff --git a/requirements.txt b/requirements.txt index 755a1a7..9582f9b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,9 @@ tqdm==4.56.0 ffmpeg-python==0.2.0 git+https://github.com/openai/whisper.git +faster-whisper +nvidia-cublas-cu12 +nvidia-cudnn-cu12 +nvidia-cublas-cu11 +nvidia-cudnn-cu11 +ctranslate2==3.24.0 \ No newline at end of file