
more updates, working with whisper and faster-whisper

Karl 8 months ago
parent
Commit
966fa639c4

+ 17 - 5
.vscode/launch.json

@@ -6,19 +6,31 @@
     "configurations": [
         {
             "name": "Python Debugger: Current File",
-            "type": "debugpy",
+            "type": "python",
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
             "justMyCode": false,
             "env": {
-                "CUDA_VISIBLE_DEVICES": "1"
+                "CUDA_VISIBLE_DEVICES": "1",
+                "LD_LIBRARY_PATH": "/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cudnn/lib"
             },
             "args": [
                 "--model",
-                "base",
-                "--show",
-                "Gary Neville's Soccerbox"
+                "base"
+            ]
+        },
+        {
+            "name": "Current (withenv)",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/run_with_env.sh",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "args": [
+                "${file}",
+                "--model",
+                "base"
             ]
         }
     ]
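The hard-coded LD_LIBRARY_PATH points at the cuBLAS/cuDNN wheels that CTranslate2 (faster-whisper's backend) loads at runtime. The run_with_env.sh wrapper referenced by the second configuration is not included in this diff. A minimal sketch of how those paths can be derived from the installed wheels instead of hard-coding the venv location (the helper file name is hypothetical):

```python
# find_nvidia_libs.py (hypothetical helper): derive the LD_LIBRARY_PATH
# entries from the installed NVIDIA wheels instead of hard-coding the venv
# path. Assumes nvidia-cublas-cu12 and nvidia-cudnn-cu12 (see
# requirements.txt below) are installed.
import os

import nvidia.cublas.lib
import nvidia.cudnn.lib

paths = [
    os.path.dirname(nvidia.cublas.lib.__file__),
    os.path.dirname(nvidia.cudnn.lib.__file__),
]
# Export before launching Python, e.g. in run_with_env.sh:
#   export LD_LIBRARY_PATH="$(python find_nvidia_libs.py):$LD_LIBRARY_PATH"
print(":".join(paths))
```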

+ 0 - 18
bazarr-ai-sub-generator/cli.py

@@ -55,24 +55,6 @@ def main():
     #     help="Type to use for computation. \
     #                           See https://opennmt.net/CTranslate2/quantization.html.",
     # )
-    # parser.add_argument(
-    #     "--beam_size",
-    #     type=int,
-    #     default=5,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
-    # parser.add_argument(
-    #     "--no_speech_threshold",
-    #     type=float,
-    #     default=0.6,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
-    # parser.add_argument(
-    #     "--condition_on_previous_text",
-    #     type=str2bool,
-    #     default=True,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
     parser.add_argument(
         "--show",
         type=str,
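The three deleted flags correspond to keyword arguments that faster_whisper.WhisperModel.transcribe() accepts directly. A minimal sketch of how they would be forwarded if the flags were re-enabled (the audio path is a placeholder):

```python
# Sketch, not part of this commit: the removed CLI flags map one-to-one
# onto transcribe() keyword arguments.
from faster_whisper import WhisperModel

model = WhisperModel("base", device="cuda")
segments, info = model.transcribe(
    "audio.wav",                      # placeholder input file
    beam_size=5,                      # wider beam: slower, usually more accurate
    no_speech_threshold=0.6,          # threshold for dropping likely-silent segments
    condition_on_previous_text=True,  # use prior text as context; can propagate errors
)
for segment in segments:
    print(segment.start, segment.end, segment.text)
```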

+ 10 - 6
bazarr-ai-sub-generator/main.py

@@ -6,6 +6,7 @@ from utils.files import filename, write_srt
 from utils.ffmpeg import get_audio, add_subtitles_to_mp4
 from utils.bazarr import get_wanted_episodes, get_episode_details, sync_series
 from utils.sonarr import update_show_in_sonarr
+# from utils.faster_whisper import WhisperAI
 from utils.whisper import WhisperAI
 from utils.decorator import measure_time
 
@@ -34,13 +35,16 @@ def process(args: dict):
     for episode in list_of_episodes_needing_subtitles["data"]:
         print(f"Processing {episode['seriesTitle']} - {episode['episode_number']}")
         episode_data = get_episode_details(episode["sonarrEpisodeId"])
-        audios = get_audio([episode_data["path"]], 0, None)
-        subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args)
+        try:
+            audios = get_audio([episode_data["path"]], 0, None)
+            subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args)
 
-        add_subtitles_to_mp4(subtitles)
-        update_show_in_sonarr(episode["sonarrSeriesId"])
-        time.sleep(5)
-        sync_series()
+            add_subtitles_to_mp4(subtitles)
+            update_show_in_sonarr(episode["sonarrSeriesId"])
+            time.sleep(5)
+            sync_series()
+        except Exception as ex:
+            print(f"skipping file due to - {ex}")
 
 @measure_time
 def get_subtitles(
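The new try/except keeps one failing episode from aborting the whole batch, but a bare Exception also discards the stack trace. A minimal sketch, not part of this commit, of the same skip-and-continue pattern with the traceback preserved (process_episode is a hypothetical stand-in for the loop body above):

```python
import logging
import traceback


def safe_process(episode, process_episode):
    """process_episode is a hypothetical callable wrapping the loop body above."""
    try:
        process_episode(episode)
    except Exception as ex:  # deliberate catch-all, mirroring the diff
        logging.warning("skipping file due to - %s", ex)
        logging.debug(traceback.format_exc())
```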

+ 68 - 0
bazarr-ai-sub-generator/utils/faster_whisper.py

@@ -0,0 +1,68 @@
+import warnings
+import faster_whisper
+from tqdm import tqdm
+
+
+# pylint: disable=R0903
+class WhisperAI:
+    """
+    Wrapper class for the Whisper speech recognition model with additional functionality.
+
+    This class provides a high-level interface for transcribing audio files using the Whisper
+    speech recognition model. It encapsulates the model instantiation and transcription process,
+    allowing users to easily transcribe audio files and iterate over the resulting segments.
+
+    Usage:
+    ```python
+    whisper = WhisperAI(model_args, transcribe_args)
+
+    # Transcribe an audio file and iterate over the segments
+    for segment in whisper.transcribe(audio_path):
+        # Process each transcription segment
+        print(segment)
+    ```
+
+    Args:
+    - model_args: Arguments to pass to WhisperModel initialize method
+        - model_size_or_path (str): The name of the Whisper model to use.
+        - device (str): The device to use for computation ("cpu", "cuda", "auto").
+        - compute_type (str): The type to use for computation.
+            See https://opennmt.net/CTranslate2/quantization.html.
+    - transcribe_args (dict): Additional arguments to pass to the transcribe method.
+
+    Attributes:
+    - model (faster_whisper.WhisperModel): The underlying Whisper speech recognition model.
+    - transcribe_args (dict): Additional arguments used for transcribe method.
+
+    Methods:
+    - transcribe(audio_path): Transcribes an audio file and yields the resulting segments.
+    """
+
+    def __init__(self, model_args: dict, transcribe_args: dict):
+        # self.model = faster_whisper.WhisperModel(**model_args)
+        model_size = "base"
+        self.model = faster_whisper.WhisperModel(model_size, device="cuda")
+        self.transcribe_args = transcribe_args
+
+    def transcribe(self, audio_path: str):
+        """
+        Transcribes the specified audio file and yields the resulting segments.
+
+        Args:
+        - audio_path (str): The path to the audio file for transcription.
+
+        Yields:
+        - faster_whisper.TranscriptionSegment: An individual transcription segment.
+        """
+        warnings.filterwarnings("ignore")
+        segments, info = self.model.transcribe(audio_path, beam_size=5)
+        warnings.filterwarnings("default")
+
+        # Same precision as the Whisper timestamps.
+        total_duration = round(info.duration, 2)
+
+        with tqdm(total=total_duration, unit=" seconds") as pbar:
+            for segment in segments:
+                yield segment
+                pbar.update(segment.end - segment.start)
+            pbar.update(0)
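Note that as committed, __init__ ignores model_args (size and device are hard-coded) and transcribe() does not yet forward transcribe_args. A minimal usage sketch against the class exactly as it stands (the audio path is a placeholder):

```python
# Usage sketch for the wrapper above; "/tmp/episode.wav" is a placeholder.
from utils.faster_whisper import WhisperAI

# model_args is currently unused; transcribe_args is stored but not forwarded.
whisper = WhisperAI(model_args={}, transcribe_args={})
for segment in whisper.transcribe("/tmp/episode.wav"):
    print(f"[{segment.start:7.2f} -> {segment.end:7.2f}] {segment.text.strip()}")
```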

+ 1 - 1
bazarr-ai-sub-generator/utils/whisper.py

@@ -50,7 +50,7 @@ class WhisperAI:
         # Set device for computation
         self.device = torch.device(device)
         # Load the Whisper model with the specified size
-        self.model = whisper.load_model("base").to(self.device)
+        self.model = whisper.load_model("base.en").to(self.device)
         # Store the additional transcription arguments
         self.transcribe_args = transcribe_args
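"base.en" is the English-only checkpoint, which generally outperforms the multilingual "base" model on English audio at the same size. A short sketch, assuming a CUDA device is available, for listing the shipped checkpoints:

```python
# Sketch: openai-whisper ships multilingual and English-only (".en") checkpoints.
import whisper

print(whisper.available_models())  # includes 'base' and 'base.en', among others
model = whisper.load_model("base.en", device="cuda")  # device can be passed directly
```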
 

+ 6 - 0
requirements.txt

@@ -1,3 +1,9 @@
 tqdm==4.56.0
 ffmpeg-python==0.2.0
 git+https://github.com/openai/whisper.git
+faster-whisper
+nvidia-cublas-cu12
+nvidia-cudnn-cu12
+nvidia-cublas-cu11
+nvidia-cudnn-cu11
+ctranslate2==3.24.0
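The NVIDIA wheels and the ctranslate2 pin back the GPU path used by faster-whisper. A minimal sanity check, assuming LD_LIBRARY_PATH is set as in launch.json above:

```python
# Sketch: verify the pinned ctranslate2 build loads and can see the GPU.
import ctranslate2

print(ctranslate2.__version__)              # expect 3.24.0 per the pin
print(ctranslate2.get_cuda_device_count())  # > 0 when CUDA is usable
```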