Skip to content

Commit 3bdd0ea

Browse files
committed
Fix bug with producing summary from an incorrect cached transcript
1 parent 4fe54d5 commit 3bdd0ea

1 file changed

Lines changed: 66 additions & 20 deletions

File tree

youtubevideotranscriptbot/telegram_bot.py

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,19 @@ async def handle_youtube_link(update: Update, context: CallbackContext):
333333

334334
if transcripts:
335335
context.user_data['video_id'] = video_id # Store video_id for later use
336-
context.user_data['transcripts'] = transcripts # Store transcripts for later use
336+
337+
transcripts_in_context = []
338+
transcripts_in_context = context.user_data.get('transcripts', [])
339+
logger.info(f"Current transcripts in user context: {len(transcripts_in_context)}")
340+
341+
if transcripts_in_context and len(transcripts_in_context) > 0 and len(transcripts_in_context) < 10:
342+
logger.info(f"Appending new transcripts to user context. Current count: {len(transcripts_in_context)}")
343+
transcripts_in_context.extend(transcripts)
344+
logger.info(f"New transcripts count in user context: {len(transcripts_in_context)}") # Extend the existing list with new transcripts
345+
context.user_data['transcripts'] = transcripts_in_context
346+
else:
347+
logger.info(f"Clearing transcripts in user context and storing from scratch. New count: {len(transcripts)}")
348+
context.user_data['transcripts'] = transcripts # Store transcripts for later use
337349

338350
try:
339351
for transcript in transcripts:
@@ -488,8 +500,30 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
488500
video_id, language, transcript_request_id = query.data.split('&')[1:4]
489501
logger.info(f"The following language selected for summary: {language}")
490502

491-
base_filename = context.user_data.get('base_filename')
492-
transcripts = context.user_data.get('transcripts')
503+
# base_filename = context.user_data.get('base_filename')
504+
base_filename = 'unknown'
505+
transcripts = context.user_data.get('transcripts', [])
506+
507+
508+
logger.info(f"Summary requested for video id: {video_id}.")
509+
logger.info(f"Total number of transcripts found in user context: {len(transcripts)}")
510+
511+
if transcripts and len(transcripts) > 0:
512+
logger.info(f"The first available trascript video id: {transcripts[0].get('video_id', 'unknown')} and filename {transcripts[0].get('filename', 'unknown')}")
513+
514+
transcript = None
515+
516+
for t in transcripts:
517+
if t.get('video_id') == video_id and t.get('type') == 'transcript':
518+
logger.info(f"Found matching transcript for video {video_id} in user context.")
519+
transcript = t
520+
base_filename = t.get('base_filename')
521+
break
522+
523+
if not transcript:
524+
logger.info(f"No matching transcript found for video {video_id} in user context.")
525+
transcripts = None
526+
493527

494528
model = MODEL_TO_USE # Use the model specified in the config
495529

@@ -520,17 +554,17 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
520554
}
521555
)
522556

523-
if not base_filename:
524-
logger.error("base_filename not found in user_data.")
525-
await query.edit_message_text("⚠️ Failed to generate summary. Missing file information.")
526-
return
557+
# if not base_filename:
558+
# logger.error("base_filename not found in user_data.")
559+
# await query.edit_message_text("⚠️ Failed to generate summary. Missing file information.")
560+
# return
527561

528562
await query.edit_message_text("🧠 Working on summary...")
529563

530564
summary = await get_summary_by_video_language_async(video_id=video_id, language=language, model=model)
531565

532566
if summary:
533-
logger.info(f"Summary already exists for video {video_id} in language {language} and model {model}.")
567+
logger.info(f"Summary already exists in DB for video {video_id} in language {language} and model {model}.")
534568

535569
track_event(
536570
user_id=user.id,
@@ -567,15 +601,14 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
567601
return
568602

569603
elif transcripts:
570-
logger.info(f"No existing summary found for video {video_id} in language {language} and model {model}. Proceeding to generate a new summary.")
571-
logger.info(f"Proceeding with transcript from user context: {transcripts[0].get('filename', 'unknown')}")
572-
573-
transcript = transcripts[0]
604+
logger.info(f"No existing summary found in DB for video {video_id} in language {language} and model {model}. Proceeding to generate a new summary.")
605+
574606

575607
if transcript:
608+
logger.info(f"Proceeding with summary for transcript from user context with video id: {transcript.get('video_id')}.")
576609
original_language = transcript.get('normalized_language_code', 'en')
577610
text = transcript.get('text')
578-
logger.info(f"Transcript found in user contextfor original language: {original_language}")
611+
logger.info(f"Transcript found in user context for original language: {original_language}")
579612
# Handle summarization request
580613
logger.info(f"Starting summarization request from original '{original_language}' to target '{language}'")
581614
logger.info(f"Text length to summarize is {len(text.split())} words.")
@@ -609,8 +642,11 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
609642
except Exception as e:
610643
logger.error(f"Failed to produce summary from data in user context: {e}")
611644

645+
else:
646+
logger.error(f"No transcript found in user context for video {video_id}. Failed to generate summary.")
647+
612648
else:
613-
logger.info(f"No transcripts found in user context. Proceeding to read from file.")
649+
logger.info(f"No transcripts found in user context for {video_id}. Proceeding to read from file on disk.")
614650

615651
try:
616652
# Ensure transcript_request_id is an integer
@@ -620,8 +656,18 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
620656
transcript_folder = "transcripts"
621657

622658
try:
659+
660+
logger.info("Fetching video details to get the base_filename and determine original language.")
623661
# Get video details
624662
video_details = get_video_details(video_id)
663+
# Include channel name in the file name (truncated to 60 characters)
664+
channel_name = video_details['snippet']['channelTitle'][:60]
665+
# Include video title in the file name (truncated to 140 characters)
666+
video_title = video_details['snippet']['title'][:140]
667+
# Sanitize the base filename
668+
base_filename = sanitize_filename(f"{channel_name}_{video_title}")
669+
670+
625671
original_language = normalize_language_code(video_details['snippet']["defaultAudioLanguage"])
626672
if original_language:
627673
summary_properties['language'] = original_language
@@ -640,16 +686,16 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
640686
transcript_filename = f"{transcript_folder}/{base_filename}_transcript_{original_language}.txt"
641687

642688
# If no transcript found in user context, read from file
643-
logger.info(f"Looking for transcript file: {transcript_filename}")
689+
logger.info(f"Looking for transcript file on disk: {transcript_filename}")
644690
if not os.path.exists(transcript_filename):
645-
logger.error(f"Transcript file not found: {transcript_filename}")
646-
raise FileNotFoundError(f"Transcript file not found: {transcript_filename}")
691+
logger.error(f"Transcript file not found on disk: {transcript_filename}")
692+
raise FileNotFoundError(f"Transcript file not found on disk: {transcript_filename}")
647693

648694
with open(transcript_filename, 'r', encoding='utf-8') as f:
649695
transcript = f.read()
650696

651697
# Handle summarization request
652-
logger.info(f"Starting summarization request from original '{original_language}' to target '{language}'")
698+
logger.info(f"Starting summarization request with file on disk from original '{original_language}' to target '{language}'")
653699
try:
654700
summary, tokens_used, estimated_cost, word_count, model = await handle_summarization_request(
655701
text=transcript,
@@ -680,7 +726,7 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
680726

681727
except ValueError as e:
682728
logger.error(f"Invalid transcript_request_id: {transcript_request_id}. Error: {e}")
683-
await query.edit_message_text("⚠️ Failed to generate summary. Invalid request ID.")
729+
await query.edit_message_text("⚠️ Failed to generate summary. Please try again.")
684730
except FileNotFoundError as e:
685731
logger.error(f"Transcript file not found: {e}")
686732
await query.edit_message_text("⚠️ Failed to generate summary. Transcript file not found.")
@@ -774,7 +820,7 @@ async def handle_summarization_button(update: Update, context: CallbackContext):
774820
}
775821
)
776822

777-
await query.edit_message_text("⚠️ Failed to generate summary. Please try again later.")
823+
msg = await query.edit_message_text("⚠️ Failed to generate summary. Please try again later.")
778824
return
779825

780826

0 commit comments

Comments
 (0)