Skip to content

Commit ce37cb5

Browse files
committed
Add transcript caching and refactor
1 parent 4045a92 commit ce37cb5

7 files changed

Lines changed: 572 additions & 397 deletions

File tree

youtubevideotranscriptbot/database.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,99 @@ def get_summary_by_video_language(video_id, language, model=MODEL_TO_USE):
186186
async def get_summary_by_video_language_async(video_id, language, model=MODEL_TO_USE):
    """
    Async wrapper around get_summary_by_video_language().

    Runs the blocking DB lookup in the default executor so the event loop
    is never blocked. Returns whatever the sync function returns.
    """
    # get_running_loop() is the recommended call inside a coroutine;
    # get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, get_summary_by_video_language, video_id, language, model)
189+
190+
def insert_transcript(transcript_properties):
    """
    Insert one transcript record into the transcripts table.

    Args:
        transcript_properties (dict): Column values keyed by column name.
            Missing keys fall back to defaults: None for identity/metadata
            fields, '' for text fields, 0/0.0 for numeric fields, True for
            is_generated, and MODEL_TO_USE for model.

    Raises:
        Exception: re-raised after logging if the INSERT fails.
    """
    # (column, default) pairs in the exact order of the INSERT column list
    # below. Driving both the value extraction and the execute() tuple from
    # this single spec avoids column/value drift — and avoids the original's
    # local variable named `type`, which shadowed the builtin.
    column_spec = (
        ('video_id', None), ('video_title', None), ('channel_name', None),
        ('channel_id', None), ('duration', None), ('video_url', None),
        ('user_id', None), ('language_code', None),
        ('normalized_language_code', None), ('is_generated', True),
        ('text', ''), ('filename', ''), ('base_filename', ''),
        ('type', 'transcript'), ('summary', ''), ('word_count', 0),
        ('tokens_used', 0), ('estimated_cost', 0.0), ('model', MODEL_TO_USE),
    )
    try:
        values = tuple(transcript_properties.get(column, default)
                       for column, default in column_spec)
        with db_cursor() as cursor:
            query = """
                INSERT INTO transcripts (
                    video_id, video_title, channel_name, channel_id, duration, video_url,
                    user_id, language_code, normalized_language_code,
                    is_generated, text, filename, base_filename, type, summary, word_count,
                    tokens_used, estimated_cost, model
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            cursor.execute(query, values)
            logger.info(f"Inserted transcript for video_id={transcript_properties.get('video_id')}, user_id={transcript_properties.get('user_id')}, language_code={transcript_properties.get('language_code')}")
    except Exception as e:
        logger.error(f"Failed to insert transcript: {e}")
        raise
234+
235+
async def insert_transcript_async(transcript_properties):
    """
    Async wrapper around insert_transcript().

    Runs the blocking DB insert in the default executor so the event loop
    is never blocked. Propagates any exception raised by the sync insert.
    """
    # get_running_loop() is the recommended call inside a coroutine;
    # get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, insert_transcript, transcript_properties)
241+
242+
def get_existing_transcripts(video_id, normalized_language_code=None):
    """
    Fetch all transcript rows for a video, newest first.

    When normalized_language_code is given, rows matching that code — or the
    'en'/'ru' fallback languages — are returned; otherwise every row for the
    video is returned. Each row is converted to a dict keyed by column name.
    Returns [] when nothing matches or on any database error (best-effort:
    the error is logged, not raised).
    """
    # Column names in the same order as the SELECT list below; used to turn
    # each fetched tuple into a dict.
    column_names = (
        "video_id", "video_title", "channel_name", "channel_id", "duration", "video_url",
        "user_id", "language_code", "normalized_language_code",
        "is_generated", "text", "filename", "base_filename", "type", "summary", "word_count",
        "tokens_used", "estimated_cost", "model",
    )
    try:
        with db_cursor() as cursor:
            if normalized_language_code:
                cursor.execute(
                    """
                SELECT video_id, video_title, channel_name, channel_id, duration, video_url,
                       user_id, language_code, normalized_language_code,
                       is_generated, text, filename, base_filename, type, summary, word_count,
                       tokens_used, estimated_cost, model
                FROM transcripts
                WHERE video_id = %s AND (normalized_language_code = %s OR normalized_language_code = 'en' OR normalized_language_code = 'ru')
                ORDER BY id DESC
                """,
                    (video_id, normalized_language_code),
                )
            else:
                cursor.execute(
                    """
                SELECT video_id, video_title, channel_name, channel_id, duration, video_url,
                       user_id, language_code, normalized_language_code,
                       is_generated, text, filename, base_filename, type, summary, word_count,
                       tokens_used, estimated_cost, model
                FROM transcripts
                WHERE video_id = %s
                ORDER BY id DESC
                """,
                    (video_id,),
                )
            fetched = cursor.fetchall()
        return [dict(zip(column_names, record)) for record in fetched]
    except Exception as e:
        logger.error(f"Failed to fetch transcripts for video_id={video_id}, language_code={normalized_language_code}: {e}")
        return []
281+
282+
async def get_existing_transcripts_async(video_id, normalized_language_code=None):
    """
    Async wrapper around get_existing_transcripts().

    Runs the blocking DB query in the default executor and returns its list
    of transcript dicts (possibly empty).
    """
    # get_running_loop() is the recommended call inside a coroutine;
    # get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, get_existing_transcripts, video_id, normalized_language_code)

youtubevideotranscriptbot/model_params.py

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ def get_model_params(model=MODEL_TO_USE):
2525
2626
Returns:
2727
dict: A dictionary containing the model parameters.
28+
29+
Cost is indicated in USD per 100K tokens.
2830
"""
2931
if "gpt" in model.lower():
3032
logger.info(f"Using OpenAI {model} model.")
@@ -33,7 +35,9 @@ def get_model_params(model=MODEL_TO_USE):
3335
"max_chunks_allowed": 4,
3436
"max_tokens": 1024,
3537
"model": model or "gpt-4o-mini",
36-
"client": openai.OpenAI(api_key=OPENAI_API_KEY, base_url="https://api.openai.com/v1")
38+
"client": openai.OpenAI(api_key=OPENAI_API_KEY, base_url="https://api.openai.com/v1"),
39+
"cost_per_100k_tokens_input": 0.25,
40+
"cost_per_100k_tokens_output": 1 # Example cost, adjust as needed
3741
}
3842
elif "deepseek" in model.lower():
3943
logger.info(f"Using DeepSeek {model}.")
@@ -42,7 +46,9 @@ def get_model_params(model=MODEL_TO_USE):
4246
"max_chunks_allowed": 5,
4347
"max_tokens": 1024,
4448
"model": model or "deepseek-chat",
45-
"client": openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
49+
"client": openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com"),
50+
"cost_per_100k_tokens_input": 0.027,
51+
"cost_per_100k_tokens_output": 0.11
4652
}
4753
elif "grok" in model.lower():
4854
logger.info(f"Using xAI Grok {model}.")
@@ -51,7 +57,9 @@ def get_model_params(model=MODEL_TO_USE):
5157
"max_chunks_allowed": 5,
5258
"max_tokens": 1024,
5359
"model": model or "grok-3-mini",
54-
"client": openai.OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1")
60+
"client": openai.OpenAI(api_key=XAI_API_KEY, base_url="https://api.x.ai/v1"),
61+
"cost_per_100k_tokens_input": 0.3,
62+
"cost_per_100k_tokens_output": 1.5
5563
}
5664
elif "claude" in model.lower():
5765
logger.info(f"Using Anthropic's Claude {model}.")
@@ -60,7 +68,9 @@ def get_model_params(model=MODEL_TO_USE):
6068
"max_chunks_allowed": 3,
6169
"max_tokens": 1024,
6270
"model": model or "claude-3-haiku-20240307",
63-
"client": openai.OpenAI(api_key=ANTHROPIC_API_KEY, base_url="https://api.anthropic.com/v1/")
71+
"client": openai.OpenAI(api_key=ANTHROPIC_API_KEY, base_url="https://api.anthropic.com/v1/"),
72+
"cost_per_100k_tokens_input": 0.25,
73+
"cost_per_100k_tokens_output": 1.5
6474
}
6575
else:
6676
logger.error("Invalid model selection in config. Please select 'gpt' for OpenAI or 'deepseek' for DeepSeek.")
@@ -70,35 +80,8 @@ def get_model_params(model=MODEL_TO_USE):
7080
"max_chunks_allowed": 5,
7181
"max_tokens": 1024,
7282
"model": model or "deepseek-chat",
73-
"client": openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
83+
"client": openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com"),
84+
"cost_per_100k_tokens_input": 0.3,
85+
"cost_per_100k_tokens_output": 1.5
7486
}
7587

76-
77-
# if MODEL_TO_USE:
78-
# if "gpt" in MODEL_TO_USE.lower():
79-
# model_to_use = 1 # OpenAI model
80-
# elif "deepseek" in MODEL_TO_USE.lower():
81-
# model_to_use = 2
82-
# else:
83-
# logger.error("Invalid model selection in config. Please select 'gpt' for OpenAI or 'deepseek' for DeepSeek.")
84-
# logger.warning("Falling back to DeepSeek model as default.")
85-
# model_to_use = 2 # 1 for OpenAI, 2 for DeepSeek
86-
# raise ValueError("Invalid model selection in config. Please select 'gpt' for OpenAI or 'deepseek' for DeepSeek.")
87-
88-
# if model_to_use == 1:
89-
# tokens_per_chunk = 100000
90-
# max_chunks_allowed = 4
91-
# max_tokens = 1024
92-
# model = "gpt-4o-mini"
93-
# client = openai.OpenAI(api_key=OPENAI_API_KEY, base_url="https://api.openai.com/v1")
94-
# logger.info(f"Using OpenAI {model} model for summarization.")
95-
# elif model_to_use == 2:
96-
# tokens_per_chunk = 64000
97-
# max_chunks_allowed = 5
98-
# max_tokens = 1024
99-
# model = "deepseek-chat"
100-
# # for DeepSeek backward compatibility, you can still use `https://api.deepseek.com/v1` as `base_url`.
101-
# client = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url="https://api.deepseek.com")
102-
# logger.info(f"Using DeepSeek {model} for summarization.")
103-
# else:
104-
# logger.error("Invalid model selection. Please select 1 for OpenAI or 2 for DeepSeek.")

0 commit comments

Comments
 (0)