more updates, working with whisper and faster-whisper

Karl 2024-07-16 07:31:28 +00:00
parent 281b3cabc1
commit 966fa639c4
6 changed files with 102 additions and 30 deletions

.vscode/launch.json

@@ -6,19 +6,31 @@
     "configurations": [
         {
             "name": "Python Debugger: Current File",
-            "type": "debugpy",
+            "type": "python",
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
             "justMyCode": false,
             "env": {
-                "CUDA_VISIBLE_DEVICES": "1"
+                "CUDA_VISIBLE_DEVICES": "1",
+                "LD_LIBRARY_PATH": "/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cudnn/lib"
             },
             "args": [
                 "--model",
-                "base",
-                "--show",
-                "Gary Neville's Soccerbox"
+                "base"
             ]
+        },
+        {
+            "name": "Current (withenv)",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/run_with_env.sh",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "args": [
+                "${file}",
+                "--model",
+                "base"
+            ]
         }
     ]
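Note: LD_LIBRARY_PATH is read by the dynamic linker when the process starts, which is presumably why the second configuration launches through run_with_env.sh rather than relying on "env" alone. A minimal sketch, following the approach suggested in the faster-whisper docs, that derives the same path from the pip-installed NVIDIA wheels instead of hard-coding the venv location:

import os
import nvidia.cublas.lib
import nvidia.cudnn.lib

# Print the directories holding the cuBLAS/cuDNN shared libraries, suitable
# for exporting as LD_LIBRARY_PATH before launching Python.
print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" +
      os.path.dirname(nvidia.cudnn.lib.__file__))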


@@ -55,24 +55,6 @@ def main():
     #     help="Type to use for computation. \
     #         See https://opennmt.net/CTranslate2/quantization.html.",
     # )
-    # parser.add_argument(
-    #     "--beam_size",
-    #     type=int,
-    #     default=5,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
-    # parser.add_argument(
-    #     "--no_speech_threshold",
-    #     type=float,
-    #     default=0.6,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
-    # parser.add_argument(
-    #     "--condition_on_previous_text",
-    #     type=str2bool,
-    #     default=True,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
     parser.add_argument(
         "--show",
         type=str,


@@ -6,6 +6,7 @@ from utils.files import filename, write_srt
 from utils.ffmpeg import get_audio, add_subtitles_to_mp4
 from utils.bazarr import get_wanted_episodes, get_episode_details, sync_series
 from utils.sonarr import update_show_in_sonarr
+# from utils.faster_whisper import WhisperAI
 from utils.whisper import WhisperAI
 from utils.decorator import measure_time
@@ -34,13 +35,16 @@ def process(args: dict):
     for episode in list_of_episodes_needing_subtitles["data"]:
         print(f"Processing {episode['seriesTitle']} - {episode['episode_number']}")
         episode_data = get_episode_details(episode["sonarrEpisodeId"])
-        audios = get_audio([episode_data["path"]], 0, None)
-        subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args)
-        add_subtitles_to_mp4(subtitles)
-        update_show_in_sonarr(episode["sonarrSeriesId"])
-        time.sleep(5)
-        sync_series()
+        try:
+            audios = get_audio([episode_data["path"]], 0, None)
+            subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args)
+            add_subtitles_to_mp4(subtitles)
+            update_show_in_sonarr(episode["sonarrSeriesId"])
+            time.sleep(5)
+            sync_series()
+        except Exception as ex:
+            print(f"skipping file due to - {ex}")


 @measure_time
 def get_subtitles(
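The measure_time decorator imported above is not part of this diff; a minimal sketch of what such a decorator plausibly looks like, assuming it only logs wall-clock duration:

import time
from functools import wraps

def measure_time(func):
    """Log how long the wrapped function takes (hypothetical implementation)."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} took {time.perf_counter() - start:.2f}s")
        return result
    return wrapper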

utils/faster_whisper.py

@@ -0,0 +1,68 @@
+import warnings
+import faster_whisper
+from tqdm import tqdm
+
+
+# pylint: disable=R0903
+class WhisperAI:
+    """
+    Wrapper class for the Whisper speech recognition model with additional functionality.
+
+    This class provides a high-level interface for transcribing audio files using the Whisper
+    speech recognition model. It encapsulates the model instantiation and transcription process,
+    allowing users to easily transcribe audio files and iterate over the resulting segments.
+
+    Usage:
+    ```python
+    whisper = WhisperAI(model_args, transcribe_args)
+
+    # Transcribe an audio file and iterate over the segments
+    for segment in whisper.transcribe(audio_path):
+        # Process each transcription segment
+        print(segment)
+    ```
+
+    Args:
+    - model_args: Arguments to pass to WhisperModel initialize method
+        - model_size_or_path (str): The name of the Whisper model to use.
+        - device (str): The device to use for computation ("cpu", "cuda", "auto").
+        - compute_type (str): The type to use for computation.
+          See https://opennmt.net/CTranslate2/quantization.html.
+    - transcribe_args (dict): Additional arguments to pass to the transcribe method.
+
+    Attributes:
+    - model (faster_whisper.WhisperModel): The underlying Whisper speech recognition model.
+    - transcribe_args (dict): Additional arguments used for transcribe method.
+
+    Methods:
+    - transcribe(audio_path): Transcribes an audio file and yields the resulting segments.
+    """
+
+    def __init__(self, model_args: dict, transcribe_args: dict):
+        # self.model = faster_whisper.WhisperModel(**model_args)
+        model_size = "base"
+        self.model = faster_whisper.WhisperModel(model_size, device="cuda")
+        self.transcribe_args = transcribe_args
+
+    def transcribe(self, audio_path: str):
+        """
+        Transcribes the specified audio file and yields the resulting segments.
+
+        Args:
+        - audio_path (str): The path to the audio file for transcription.
+
+        Yields:
+        - faster_whisper.TranscriptionSegment: An individual transcription segment.
+        """
+        warnings.filterwarnings("ignore")
+        segments, info = self.model.transcribe(audio_path, beam_size=5)
+        warnings.filterwarnings("default")
+
+        # Same precision as the Whisper timestamps.
+        total_duration = round(info.duration, 2)
+
+        with tqdm(total=total_duration, unit=" seconds") as pbar:
+            for segment in segments:
+                yield segment
+                pbar.update(segment.end - segment.start)
+            pbar.update(0)
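A minimal usage sketch for the new wrapper (the audio path is hypothetical, and note that model_args is effectively ignored for now, since __init__ hard-codes the "base" model on CUDA):

from utils.faster_whisper import WhisperAI

whisper = WhisperAI(model_args={}, transcribe_args={})
# Segments are yielded lazily while the tqdm bar tracks transcribed audio time.
for segment in whisper.transcribe("/tmp/example.wav"):  # hypothetical audio file
    print(f"[{segment.start:7.2f} -> {segment.end:7.2f}] {segment.text.strip()}")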

utils/whisper.py

@@ -50,7 +50,7 @@ class WhisperAI:
         # Set device for computation
         self.device = torch.device(device)
         # Load the Whisper model with the specified size
-        self.model = whisper.load_model("base").to(self.device)
+        self.model = whisper.load_model("base.en").to(self.device)
         # Store the additional transcription arguments
         self.transcribe_args = transcribe_args
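"base.en" selects Whisper's English-only checkpoint, which is the same size as the multilingual "base" model but generally more accurate on English audio. A one-line sketch of the same call, assuming the openai-whisper package:

import whisper

model = whisper.load_model("base.en")  # English-only variant of the ~74M-parameter base model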

requirements.txt

@ -1,3 +1,9 @@
tqdm==4.56.0 tqdm==4.56.0
ffmpeg-python==0.2.0 ffmpeg-python==0.2.0
git+https://github.com/openai/whisper.git git+https://github.com/openai/whisper.git
faster-whisper
nvidia-cublas-cu12
nvidia-cudnn-cu12
nvidia-cublas-cu11
nvidia-cudnn-cu11
ctranslate2==3.24.0
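A quick sanity check that the CUDA libraries from these wheels are loadable at runtime (a sketch, assuming the pinned ctranslate2 is installed):

import ctranslate2

# Prints 0 if no CUDA device is usable by CTranslate2; in that case
# LD_LIBRARY_PATH probably lacks the nvidia wheel lib directories.
print("CUDA devices:", ctranslate2.get_cuda_device_count())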