-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup_voices.py
More file actions
472 lines (402 loc) · 19.2 KB
/
setup_voices.py
File metadata and controls
472 lines (402 loc) · 19.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
#!/usr/bin/env python3
"""
Setup Voice Files for EchoForge
This script generates high-quality voice samples for the EchoForge application
using the CSM voice generation model. It creates a set of voice files with
different characteristics and places them in the correct locations.
Usage:
python setup_voices.py [--force] [--cpu]
Options:
--force Overwrite existing voice files if they exist
--cpu Force CPU usage instead of GPU (slower but works on all systems)
Requirements for high-quality voice generation:
1. csm-voice-cloning directory must be set up:
- This directory should contain: generator.py, models.py, etc.
2. Required Python packages (install with pip):
- torch==2.4.0
- torchaudio==2.4.0
- transformers==4.49.0
- huggingface_hub==0.28.1
- moshi==0.2.2
- torchtune==0.4.0
- torchao==0.9.0
- silentcipher (from GitHub)
You can install all required packages with:
pip install -r csm-voice-cloning/requirements.txt
The script will:
1. Create the necessary directory structure
2. Download the CSM voice generation model
3. Generate realistic voice samples with different characteristics
4. Place them in the correct locations for EchoForge to use
"""
import os
import sys
import argparse
import json
import time
import random
from pathlib import Path
# Handle the CSM model import
try:
# Try to import from csm-voice-cloning directory
sys.path.append(os.path.join(os.path.dirname(__file__), 'csm-voice-cloning'))
from generator import load_csm_1b, Segment
CSM_AVAILABLE = True
print("Successfully imported CSM modules from csm-voice-cloning")
except ImportError as e:
print(f"Warning: CSM voice generation modules not found: {e}")
print("Voice files will be generated using basic waveforms instead")
print("For high-quality voices, ensure the csm-voice-cloning directory is present")
print("and all dependencies are installed (see requirements.txt in that directory)")
CSM_AVAILABLE = False
# Try to import torch, required for CSM
try:
import torch
import torchaudio
from huggingface_hub import hf_hub_download
TORCH_AVAILABLE = True
except ImportError as e:
print(f"Warning: PyTorch not found: {e}")
print("Install required packages with: pip install -r csm-voice-cloning/requirements.txt")
TORCH_AVAILABLE = False
# Try to import other required packages for CSM
if CSM_AVAILABLE and TORCH_AVAILABLE:
try:
# These imports are just to check if the packages are available
import moshi
import transformers
import tokenizers
ALL_DEPS_AVAILABLE = True
except ImportError as e:
print(f"Warning: Additional dependencies for CSM not found: {e}")
print("Install required packages with: pip install -r csm-voice-cloning/requirements.txt")
ALL_DEPS_AVAILABLE = False
else:
ALL_DEPS_AVAILABLE = False
# Only if dependencies are unavailable, import numpy for basic waveform generation
if not ALL_DEPS_AVAILABLE:
import numpy as np
import wave
import struct
# Voice configurations with detailed parameters for CSM model
VOICE_CONFIGS = [
{
"id": 1,
"name": "Commander Sterling",
"gender": "male",
"description": "Deep authoritative voice with confident tone",
"speaker_id": 4, # Maps to CSM speaker with gravitas
"temperature": 0.8, # Lower temperature for more consistent output
"topk": 40,
"character_traits": "confident, commanding, authoritative",
"sample_text": "The mission briefing is clear. We must secure the perimeter and establish a base of operations immediately."
},
{
"id": 2,
"name": "Dr. Elise Jensen",
"gender": "female",
"description": "Clear, articulate voice with professional tone",
"speaker_id": 2, # Maps to CSM medium-pitched female
"temperature": 0.9,
"topk": 45,
"character_traits": "intelligent, precise, professional",
"sample_text": "The analysis confirms our hypothesis. The compound exhibits remarkable properties under these specific conditions."
},
{
"id": 3,
"name": "James Fletcher",
"gender": "male",
"description": "Warm, friendly voice with natural cadence",
"speaker_id": 0, # Maps to CSM clear articulation male
"temperature": 1.0,
"topk": 50,
"character_traits": "warm, friendly, approachable",
"sample_text": "It's great to see everyone here today. I'm looking forward to getting to know each of you better as we work together."
},
{
"id": 4,
"name": "Sophia Chen",
"gender": "female",
"description": "Energetic voice with varied intonation",
"speaker_id": 5, # Maps to CSM smooth female with warm tone
"temperature": 1.1, # Higher temperature for more variation
"topk": 50,
"character_traits": "energetic, expressive, dynamic",
"sample_text": "The innovation challenge starts now! Every idea counts, so don't hold back. Let's transform the way people think about this!"
},
{
"id": 5,
"name": "Morgan Riley",
"gender": "neutral",
"description": "Balanced, neutral voice with moderate tone",
"speaker_id": 3, # Maps to CSM higher pitched voice
"temperature": 0.9,
"topk": 45,
"character_traits": "balanced, neutral, versatile",
"sample_text": "Consider all perspectives before making your decision. Each option has its own merits and challenges to navigate."
}
]
# Alternative texts for generating additional voice samples for the same speaker
ALTERNATIVE_TEXTS = [
"Welcome to EchoForge, the advanced voice generation system that creates realistic speech.",
"The purpose of this demonstration is to showcase the capabilities of our voice synthesis technology.",
"This sample demonstrates the tonal qualities and speech patterns characteristic of this voice type.",
"Voice generation has many applications in creative media, accessibility, and user interfaces.",
"When properly implemented, synthetic voices can sound remarkably natural and expressive."
]
def create_basic_voice_audio(filename, voice_config, sample_rate=22050):
"""
Create a basic voice audio file using sine waves when CSM is not available.
This is a fallback method that creates distinctive but simple voice patterns.
"""
# Extract parameters from voice config
voice_id = voice_config["id"]
gender = voice_config["gender"]
# Set base frequency based on gender
if gender == "male":
base_freq = 120.0
elif gender == "female":
base_freq = 220.0
else: # neutral
base_freq = 165.0
# Adjust slightly based on voice ID for variety
base_freq *= (1.0 + (voice_id - 3) * 0.05)
# Parameters
duration = 4.0 # seconds
amplitude = 24000 # Volume (16-bit max is 32767)
num_samples = int(duration * sample_rate)
# Generate time array
t = np.linspace(0, duration, num_samples)
# Create fundamental frequency with slight variation
freq_mod = 1.0 + 0.01 * np.sin(2 * np.pi * 0.5 * t)
fundamental = amplitude * 0.5 * np.sin(2 * np.pi * base_freq * freq_mod * t)
# Add harmonics for richer voice quality
audio_data = fundamental.copy()
# Add several harmonics with decreasing amplitude
for harmonic in range(2, 6):
harmonic_amp = amplitude * 0.5 / harmonic
formant_freq = base_freq * harmonic * (0.9 if gender == "male" else 1.1 if gender == "female" else 1.0)
audio_data += harmonic_amp * np.sin(2 * np.pi * formant_freq * t)
# Add some noise for breathiness
noise = np.random.normal(0, amplitude * 0.05, num_samples)
audio_data += noise
# Apply envelope
envelope = np.ones(num_samples)
attack = int(0.1 * sample_rate)
release = int(0.2 * sample_rate)
# Attack phase
envelope[:attack] = np.linspace(0, 1, attack)
# Release phase
envelope[-release:] = np.linspace(1, 0, release)
audio_data = audio_data * envelope
# Ensure we stay within 16-bit range
audio_data = np.clip(audio_data, -32767, 32767).astype(np.int16)
# Create WAV file
os.makedirs(os.path.dirname(filename), exist_ok=True)
with wave.open(filename, 'w') as wav_file:
wav_file.setnchannels(1) # Mono
wav_file.setsampwidth(2) # 16 bit
wav_file.setframerate(sample_rate)
wav_file.writeframes(audio_data.tobytes())
print(f"Created basic voice file: {filename}")
return filename
def generate_csm_voice(generator, voice_config, output_filename):
"""
Generate a high-quality voice sample using the CSM model.
Args:
generator: CSM voice generator
voice_config: Dictionary with voice parameters
output_filename: Where to save the generated audio
"""
# Extract parameters
speaker_id = voice_config["speaker_id"]
temperature = voice_config["temperature"]
topk = voice_config["topk"]
text = voice_config["sample_text"]
print(f"Generating voice sample: {voice_config['name']} (speaker={speaker_id}, temp={temperature})")
# Generate audio with CSM
try:
audio = generator.generate(
text=text,
speaker=speaker_id,
context=[],
max_audio_length_ms=15000,
temperature=temperature,
topk=topk
)
# Save the audio
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
torchaudio.save(output_filename, audio.unsqueeze(0).cpu(), generator.sample_rate)
print(f"Created CSM voice file: {output_filename}")
return True
except Exception as e:
print(f"Error generating CSM voice: {e}")
return False
def setup_directory_structure():
"""Create the necessary directory structure for voice files."""
# Define the main directories needed
directories = [
"static/voices/creative",
"static/voices/standard",
"static/images",
"static/samples"
]
# Create directories
for directory in directories:
os.makedirs(directory, exist_ok=True)
print(f"Ensured directory exists: {directory}")
def create_voice_metadata_file(voice_configs, output_path):
"""Create a metadata file with information about available voices."""
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
f.write("# EchoForge Voice Metadata\n")
f.write("# This file contains information about available voices\n\n")
for voice in voice_configs:
f.write(f"## Voice ID: {voice['id']}\n")
f.write(f"Name: {voice['name']}\n")
f.write(f"Gender: {voice['gender']}\n")
f.write(f"Description: {voice['description']}\n")
f.write(f"Character Traits: {voice['character_traits']}\n")
f.write(f"Sample Text: \"{voice['sample_text']}\"\n\n")
print(f"Created voice metadata file: {output_path}")
def create_default_sample_texts():
"""Create default sample text file for voice generation."""
sample_texts = {
"greetings": [
"Hello, it's nice to meet you. Welcome to EchoForge voice generation system.",
"Greetings and welcome! I'm excited to help you explore the world of voice synthesis.",
"Hi there! I'm a synthesized voice created by EchoForge. How can I assist you today?"
],
"information": [
"EchoForge is an advanced voice generation platform capable of creating realistic voices from text.",
"Voice synthesis technology has advanced significantly in recent years, enabling more natural-sounding speech.",
"This demo showcases the capabilities of EchoForge for generating human-like speech from text input."
],
"stories": [
"Once upon a time, in a distant land, there lived a wise old wizard who had mastered the art of voice magic.",
"The journey to the mountain's peak was arduous, but the view from the top made every step worthwhile.",
"As the sun set over the horizon, painting the sky in hues of orange and purple, a sense of calm settled over the landscape."
]
}
# Write sample texts to file
output_path = "static/sample_texts.txt"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as f:
f.write("# EchoForge Sample Texts\n")
f.write("# Use these texts to test voice generation\n\n")
for category, texts in sample_texts.items():
f.write(f"## {category.title()}\n")
for text in texts:
f.write(f"{text}\n")
f.write("\n")
print(f"Created sample texts file: {output_path}")
def main():
"""Main function to set up voice files for EchoForge."""
parser = argparse.ArgumentParser(description="Setup voice files for EchoForge")
parser.add_argument("--force", action="store_true", help="Overwrite existing voice files")
parser.add_argument("--cpu", action="store_true", help="Force CPU usage (slower but works on all systems)")
parser.add_argument("--skip-deps-check", action="store_true", help="Skip dependency checks and try to run anyway")
args = parser.parse_args()
print("\n=== EchoForge Voice Setup ===\n")
# Print dependency status
if not args.skip_deps_check:
print("Dependency Status:")
print(f" PyTorch Available: {'Yes' if TORCH_AVAILABLE else 'No'}")
print(f" CSM Modules Available: {'Yes' if CSM_AVAILABLE else 'No'}")
print(f" All Dependencies Available: {'Yes' if ALL_DEPS_AVAILABLE else 'No'}")
print("")
# Setup directory structure
setup_directory_structure()
# Target directory for voice files
target_dir = "static/voices/creative"
voice_metadata_path = "static/voices/voice_metadata.txt"
# Check if voice files already exist
existing_files = [f for f in os.listdir(target_dir) if f.endswith('.wav')] if os.path.exists(target_dir) else []
if existing_files and not args.force:
print(f"Found {len(existing_files)} existing voice files in {target_dir}")
print("Using existing voice files. Use --force to regenerate.")
else:
# Check if we have all the required dependencies for CSM
if ALL_DEPS_AVAILABLE or args.skip_deps_check:
# Determine device for CSM model
device = "cpu" if args.cpu else "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu" and not args.cpu:
print("CUDA not available, falling back to CPU (use --cpu to avoid this message)")
# Generate voice files using CSM
try:
# Download CSM model
print("Downloading CSM voice model...")
model_path = hf_hub_download(repo_id="sesame/csm-1b", filename="ckpt.pt")
print(f"Model downloaded to: {model_path}")
# Load CSM generator
print(f"Loading CSM model on {device}...")
generator = load_csm_1b(model_path, device)
print("CSM model loaded successfully")
# Generate voice files
print("\nGenerating high-quality voice files with CSM model...")
for voice_config in VOICE_CONFIGS:
voice_id = voice_config["id"]
voice_name = voice_config["name"].replace(" ", "_").lower()
# Generate primary voice sample
primary_filename = os.path.join(target_dir, f"voice_{voice_id}_{voice_name}.wav")
success = generate_csm_voice(generator, voice_config, primary_filename)
if success:
# Generate alternative versions with the same voice but different text
for i, alt_text in enumerate(ALTERNATIVE_TEXTS[:2]): # Just use 2 alternatives to save time
variant_config = voice_config.copy()
variant_config["sample_text"] = alt_text
# Slightly vary the parameters
variant_config["temperature"] = voice_config["temperature"] * (1.0 + (random.random() - 0.5) * 0.1)
variant_filename = os.path.join(
target_dir,
f"voice_{voice_id}_{voice_name}_variant{i+1}.wav"
)
generate_csm_voice(generator, variant_config, variant_filename)
print("Finished generating voices with CSM model")
except Exception as e:
print(f"Error with CSM voice generation: {e}")
print("Falling back to basic voice generation...")
generate_basic_voices(target_dir)
else:
print("\nUsing basic voice generation since CSM model dependencies are not available")
print("To use high-quality voice generation, install the required dependencies:")
print("pip install -r csm-voice-cloning/requirements.txt")
generate_basic_voices(target_dir)
# Create metadata file
create_voice_metadata_file(VOICE_CONFIGS, voice_metadata_path)
# Create sample texts file
create_default_sample_texts()
print("\n=== Voice Setup Complete ===")
print(f"Voice files are located in: {os.path.abspath(target_dir)}")
print("You can now use the EchoForge debug page to test voice generation:")
print(" 1. Start the server: ./start_server.sh")
print(" 2. Navigate to: http://localhost:8765/debug")
print(" 3. Select a speaker ID and enter text to generate")
print("\nEnjoy creating voices with EchoForge!")
def generate_basic_voices(target_dir):
"""Generate basic voice files when CSM is not available."""
print("\nGenerating basic voice files...")
for voice_config in VOICE_CONFIGS:
voice_id = voice_config["id"]
voice_name = voice_config["name"].replace(" ", "_").lower()
# Generate primary voice sample
primary_filename = os.path.join(target_dir, f"voice_{voice_id}_{voice_name}.wav")
create_basic_voice_audio(primary_filename, voice_config)
# Generate a couple of variants
for variant in range(1, 3):
# Make a copy of the config with slight variations
variant_config = voice_config.copy()
# Just modify the ID slightly to get different waveforms
variant_config["id"] = voice_config["id"] + variant * 0.1
variant_filename = os.path.join(
target_dir,
f"voice_{voice_id}_{voice_name}_variant{variant}.wav"
)
create_basic_voice_audio(variant_filename, variant_config)
print("Basic voice generation complete")
if __name__ == "__main__":
# Set a seed for reproducibility
random.seed(42)
if TORCH_AVAILABLE:
torch.manual_seed(42)
main()