Mirror of https://github.com/karl0ss/bazarr-ai-sub-generator.git (synced 2025-04-26 06:49:22 +01:00)
more updates, working with whisper and faster-whisper
This commit is contained in: parent 281b3cabc1, commit 966fa639c4
.vscode/launch.json (vendored): 22 changes
@@ -6,19 +6,31 @@
     "configurations": [
         {
             "name": "Python Debugger: Current File",
-            "type": "debugpy",
+            "type": "python",
             "request": "launch",
             "program": "${file}",
             "console": "integratedTerminal",
             "justMyCode": false,
             "env": {
-                "CUDA_VISIBLE_DEVICES": "1"
+                "CUDA_VISIBLE_DEVICES": "1",
+                "LD_LIBRARY_PATH": "/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cublas/lib:/home/karl/faster-auto-subtitle/venv/lib/python3.11/site-packages/nvidia/cudnn/lib"
             },
             "args": [
                 "--model",
-                "base",
-                "--show",
-                "Gary Neville's Soccerbox"
+                "base"
             ]
         },
+        {
+            "name": "Current (withenv)",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${workspaceFolder}/run_with_env.sh",
+            "console": "integratedTerminal",
+            "justMyCode": false,
+            "args": [
+                "${file}",
+                "--model",
+                "base"
+            ]
+        }
     ]
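The hardcoded LD_LIBRARY_PATH above points at the cuBLAS and cuDNN libraries shipped inside the pip-installed NVIDIA wheels, which CTranslate2 (faster-whisper's backend) must be able to find at load time. The same value can be derived from the installed packages instead of being hardcoded to one venv; a minimal sketch, assuming the nvidia-cublas and nvidia-cudnn wheels from requirements.txt are installed (the script name and its use in run_with_env.sh are assumptions, not part of this commit):

```python
# Print the library path for the pip-installed NVIDIA wheels, suitable for
# something like: export LD_LIBRARY_PATH="$(python print_nvidia_libs.py)"
import os

import nvidia.cublas.lib
import nvidia.cudnn.lib

lib_dirs = [
    os.path.dirname(nvidia.cublas.lib.__file__),
    os.path.dirname(nvidia.cudnn.lib.__file__),
]
print(":".join(lib_dirs))
```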
@@ -55,24 +55,6 @@ def main():
     # help="Type to use for computation. \
     # See https://opennmt.net/CTranslate2/quantization.html.",
     # )
-    # parser.add_argument(
-    #     "--beam_size",
-    #     type=int,
-    #     default=5,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
-    # parser.add_argument(
-    #     "--no_speech_threshold",
-    #     type=float,
-    #     default=0.6,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
-    # parser.add_argument(
-    #     "--condition_on_previous_text",
-    #     type=str2bool,
-    #     default=True,
-    #     help="model parameter, tweak to increase accuracy",
-    # )
     parser.add_argument(
         "--show",
         type=str,
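The commented-out --condition_on_previous_text flag above relies on a str2bool converter that is not part of this diff. A plausible sketch of such a helper, following the usual argparse idiom (the repo's actual implementation may differ):

```python
import argparse

def str2bool(value: str) -> bool:
    # Map common truthy/falsy strings onto booleans for argparse flags.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"boolean value expected, got {value!r}")
```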
@@ -6,6 +6,7 @@ from utils.files import filename, write_srt
 from utils.ffmpeg import get_audio, add_subtitles_to_mp4
 from utils.bazarr import get_wanted_episodes, get_episode_details, sync_series
 from utils.sonarr import update_show_in_sonarr
+# from utils.faster_whisper import WhisperAI
 from utils.whisper import WhisperAI
 from utils.decorator import measure_time
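get_wanted_episodes, get_episode_details, and sync_series wrap Bazarr's HTTP API, but utils/bazarr.py itself is not part of this diff. A rough sketch of what the first wrapper plausibly looks like; the base URL, API key, and exact endpoint path are assumptions, not confirmed by the source:

```python
import requests

BAZARR_URL = "http://localhost:6767"  # hypothetical Bazarr address
BAZARR_API_KEY = "changeme"           # hypothetical API key

def get_wanted_episodes() -> dict:
    # Bazarr authenticates API requests via an X-API-KEY header.
    resp = requests.get(
        f"{BAZARR_URL}/api/episodes/wanted",
        headers={"X-API-KEY": BAZARR_API_KEY},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()  # the caller above expects a "data" list in the payload
```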
@@ -34,13 +35,16 @@ def process(args: dict):
     for episode in list_of_episodes_needing_subtitles["data"]:
         print(f"Processing {episode['seriesTitle']} - {episode['episode_number']}")
         episode_data = get_episode_details(episode["sonarrEpisodeId"])
-        audios = get_audio([episode_data["path"]], 0, None)
-        subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args)
+        try:
+            audios = get_audio([episode_data["path"]], 0, None)
+            subtitles = get_subtitles(audios, tempfile.gettempdir(), model_args, args)

-        add_subtitles_to_mp4(subtitles)
-        update_show_in_sonarr(episode["sonarrSeriesId"])
-        time.sleep(5)
-        sync_series()
+            add_subtitles_to_mp4(subtitles)
+            update_show_in_sonarr(episode["sonarrSeriesId"])
+            time.sleep(5)
+            sync_series()
+        except Exception as ex:
+            print(f"skipping file due to - {ex}")

 @measure_time
 def get_subtitles(
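The broad except here trades precision for robustness: one bad file (missing audio stream, transcription failure) no longer aborts the whole batch. The @measure_time decorator on get_subtitles comes from utils/decorator.py, which is not shown in this diff; a minimal sketch consistent with the name:

```python
import functools
import time

def measure_time(func):
    """Log how long the wrapped call takes; a guess at utils/decorator.py."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        print(f"{func.__name__} took {time.perf_counter() - start:.2f}s")
        return result
    return wrapper
```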
bazarr-ai-sub-generator/utils/faster_whisper.py (new file): 68 additions
@@ -0,0 +1,68 @@
+import warnings
+import faster_whisper
+from tqdm import tqdm
+
+
+# pylint: disable=R0903
+class WhisperAI:
+    """
+    Wrapper class for the Whisper speech recognition model with additional functionality.
+
+    This class provides a high-level interface for transcribing audio files using the Whisper
+    speech recognition model. It encapsulates the model instantiation and transcription process,
+    allowing users to easily transcribe audio files and iterate over the resulting segments.
+
+    Usage:
+    ```python
+    whisper = WhisperAI(model_args, transcribe_args)
+
+    # Transcribe an audio file and iterate over the segments
+    for segment in whisper.transcribe(audio_path):
+        # Process each transcription segment
+        print(segment)
+    ```
+
+    Args:
+    - model_args: Arguments to pass to WhisperModel initialize method
+        - model_size_or_path (str): The name of the Whisper model to use.
+        - device (str): The device to use for computation ("cpu", "cuda", "auto").
+        - compute_type (str): The type to use for computation.
+          See https://opennmt.net/CTranslate2/quantization.html.
+    - transcribe_args (dict): Additional arguments to pass to the transcribe method.
+
+    Attributes:
+    - model (faster_whisper.WhisperModel): The underlying Whisper speech recognition model.
+    - transcribe_args (dict): Additional arguments used for transcribe method.
+
+    Methods:
+    - transcribe(audio_path): Transcribes an audio file and yields the resulting segments.
+    """
+
+    def __init__(self, model_args: dict, transcribe_args: dict):
+        # self.model = faster_whisper.WhisperModel(**model_args)
+        model_size = "base"
+        self.model = faster_whisper.WhisperModel(model_size, device="cuda")
+        self.transcribe_args = transcribe_args
+
+    def transcribe(self, audio_path: str):
+        """
+        Transcribes the specified audio file and yields the resulting segments.
+
+        Args:
+        - audio_path (str): The path to the audio file for transcription.
+
+        Yields:
+        - faster_whisper.TranscriptionSegment: An individual transcription segment.
+        """
+        warnings.filterwarnings("ignore")
+        segments, info = self.model.transcribe(audio_path, beam_size=5)
+        warnings.filterwarnings("default")
+
+        # Same precision as the Whisper timestamps.
+        total_duration = round(info.duration, 2)
+
+        with tqdm(total=total_duration, unit=" seconds") as pbar:
+            for segment in segments:
+                yield segment
+                pbar.update(segment.end - segment.start)
+            pbar.update(0)
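Note that __init__ currently ignores model_args and hardcodes the "base" model on CUDA, and transcribe() likewise ignores transcribe_args in favour of beam_size=5, presumably temporary while the two backends are being compared. A usage sketch (the audio path is a placeholder):

```python
from utils.faster_whisper import WhisperAI

whisper = WhisperAI(model_args={}, transcribe_args={})

# transcribe() is a generator: segments stream in as they are decoded, and the
# tqdm bar advances by each segment's duration.
for segment in whisper.transcribe("/tmp/episode_audio.wav"):
    print(f"[{segment.start:7.2f} -> {segment.end:7.2f}] {segment.text.strip()}")
```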
@@ -50,7 +50,7 @@ class WhisperAI:
         # Set device for computation
         self.device = torch.device(device)
         # Load the Whisper model with the specified size
-        self.model = whisper.load_model("base").to(self.device)
+        self.model = whisper.load_model("base.en").to(self.device)
         # Store the additional transcription arguments
         self.transcribe_args = transcribe_args
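Switching from "base" to "base.en" selects the English-only checkpoint, which is typically slightly more accurate on English speech than the multilingual model of the same size. A sketch of the surrounding constructor logic with a CPU fallback added (the fallback is an assumption, not part of this commit):

```python
import torch
import whisper

# Fall back to CPU when no GPU is visible, e.g. if CUDA_VISIBLE_DEVICES is unset.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base.en").to(torch.device(device))
```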
@@ -1,3 +1,9 @@
 tqdm==4.56.0
 ffmpeg-python==0.2.0
 git+https://github.com/openai/whisper.git
+faster-whisper
+nvidia-cublas-cu12
+nvidia-cudnn-cu12
+nvidia-cublas-cu11
+nvidia-cudnn-cu11
+ctranslate2==3.24.0
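Pinning ctranslate2==3.24.0 while shipping both the cu11 and cu12 cuBLAS/cuDNN wheels suggests this commit is working around CUDA library mismatches. A quick sanity check that the backend can actually see the GPU:

```python
import ctranslate2

print(ctranslate2.__version__)              # expect 3.24.0 per the pin above
print(ctranslate2.get_cuda_device_count())  # 0 means the CUDA libraries were not found
```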