Add PDF generation for long video descriptions

stellarnode · stellarnode · commit deff9075984d · 2025-07-20T12:21:45.000+03:00
diff --git a/youtubevideotranscriptbot/database.py b/youtubevideotranscriptbot/database.py
@@ -68,15 +68,16 @@ def store_video(video_id, video_details, subscribers):
     try:
         with db_cursor() as cursor:
             query = """
-            INSERT INTO videos (video_id, video_title, channel_name, subscribers, view_count, like_count, comment_count, description, video_link)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
+            INSERT INTO videos (video_id, video_title, channel_name, subscribers, view_count, like_count, comment_count, duration, description, video_link)
+            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
             ON DUPLICATE KEY UPDATE
             video_title = VALUES(video_title),
             channel_name = VALUES(channel_name),
             subscribers = VALUES(subscribers),
             view_count = VALUES(view_count),
             like_count = VALUES(like_count),
             comment_count = VALUES(comment_count),
+            duration = VALUES(duration),
             description = VALUES(description),
             video_link = VALUES(video_link)
             """
@@ -89,6 +90,7 @@ def store_video(video_id, video_details, subscribers):
                 video_details['statistics'].get('viewCount', 'N/A'),
                 video_details['statistics'].get('likeCount', 'N/A'),
                 video_details['statistics'].get('commentCount', 'N/A'),
+                video_details.get('contentDetails', {}).get('duration', 'PT0S'),
                 video_details['snippet'].get('description', ''),
                 f"https://www.youtube.com/watch?v={video_id}"
             ))
@@ -284,4 +286,33 @@ def get_existing_transcripts(video_id, normalized_language_code=None):
 
 async def get_existing_transcripts_async(video_id, normalized_language_code=None):
     loop = asyncio.get_event_loop()
-    return await loop.run_in_executor(None, get_existing_transcripts, video_id, normalized_language_code)
+    return await loop.run_in_executor(None, get_existing_transcripts, video_id, normalized_language_code)
+
+
+async def get_video_by_id_async(video_id):
+    """Fetch one ``videos`` row by video_id without blocking the event loop.
+
+    Returns a dict keyed by column name; None if no row matches or the query fails.
+    """
+    def get_video_by_id(video_id):
+        # Key order must mirror the SELECT column order below.
+        keys = ["video_id", "video_title", "channel_name", "subscribers", "view_count", "like_count", "comment_count", "duration", "description", "video_link"]
+        try:
+            with db_cursor() as cursor:
+                query = """
+                SELECT video_id, video_title, channel_name, subscribers, view_count, like_count, comment_count, duration, description, video_link
+                FROM videos
+                WHERE video_id = %s
+                LIMIT 1
+                """
+                cursor.execute(query, (video_id,))
+                row = cursor.fetchone()
+                # NOTE(review): assumes fetchone() returns a positional row (tuple), not a dict — confirm cursor type.
+                return dict(zip(keys, row)) if row else None
+        except Exception as e:
+            logger.error(f"Failed to fetch video for video_id={video_id}: {e}")
+            return None
+
+    # Inside a coroutine, get_running_loop() is preferred over get_event_loop().
+    loop = asyncio.get_running_loop()
+    return await loop.run_in_executor(None, get_video_by_id, video_id)
diff --git a/youtubevideotranscriptbot/languages.py b/youtubevideotranscriptbot/languages.py
@@ -0,0 +1,190 @@
+# languages.py
+# Maps two-letter language codes (ISO 639-1 style) to each language's English name and native name.
+
+languages = {
+    "aa": {"name": "Afar", "nativeName": "Qafar af"},
+    "ab": {"name": "Abkhazian", "nativeName": "аҧсуа"},
+    "ae": {"name": "Avestan", "nativeName": "avesta"},
+    "af": {"name": "Afrikaans", "nativeName": "Afrikaans"},
+    "ak": {"name": "Akan", "nativeName": "Akan"},
+    "am": {"name": "Amharic", "nativeName": "አማርኛ"},
+    "an": {"name": "Aragonese", "nativeName": "aragonés"},
+    "ar": {"name": "Arabic", "nativeName": "العربية"},
+    "as": {"name": "Assamese", "nativeName": "অসমীয়া"},
+    "av": {"name": "Avaric", "nativeName": "авар мацӀ"},
+    "ay": {"name": "Aymara", "nativeName": "aymar aru"},
+    "az": {"name": "Azerbaijani", "nativeName": "azərbaycan dili"},
+    "ba": {"name": "Bashkir", "nativeName": "башҡорт теле"},
+    "be": {"name": "Belarusian", "nativeName": "беларуская"},
+    "bg": {"name": "Bulgarian", "nativeName": "български"},
+    "bh": {"name": "Bihari", "nativeName": "भोजपुरी"},
+    "bi": {"name": "Bislama", "nativeName": "Bislama"},
+    "bm": {"name": "Bambara", "nativeName": "bamanankan"},
+    "bn": {"name": "Bengali", "nativeName": "বাংলা"},
+    "bo": {"name": "Tibetan", "nativeName": "བོད་ཡིག"},
+    "br": {"name": "Breton", "nativeName": "brezhoneg"},
+    "bs": {"name": "Bosnian", "nativeName": "bosanski"},
+    "ca": {"name": "Catalan", "nativeName": "català"},
+    "ce": {"name": "Chechen", "nativeName": "нохчийн"},
+    "ch": {"name": "Chamorro", "nativeName": "Chamoru"},
+    "co": {"name": "Corsican", "nativeName": "corsu"},
+    "cr": {"name": "Cree", "nativeName": "ᓀᐦᐃᔭᐍᐏᐣ"},
+    "cs": {"name": "Czech", "nativeName": "čeština"},
+    "cu": {"name": "Church Slavic", "nativeName": "церковнославянский"},
+    "cv": {"name": "Chuvash", "nativeName": "чӑваш чӗлхи"},
+    "cy": {"name": "Welsh", "nativeName": "Cymraeg"},
+    "da": {"name": "Danish", "nativeName": "dansk"},
+    "de": {"name": "German", "nativeName": "Deutsch"},
+    "dv": {"name": "Divehi", "nativeName": "ދިވެހި"},
+    "dz": {"name": "Dzongkha", "nativeName": "རྫོང་ཁ"},
+    "ee": {"name": "Ewe", "nativeName": "Eʋegbe"},
+    "el": {"name": "Greek", "nativeName": "Ελληνικά"},
+    "en": {"name": "English", "nativeName": "English"},
+    "eo": {"name": "Esperanto", "nativeName": "Esperanto"},
+    "es": {"name": "Spanish", "nativeName": "español"},
+    "et": {"name": "Estonian", "nativeName": "eesti"},
+    "eu": {"name": "Basque", "nativeName": "euskara"},
+    "fa": {"name": "Persian", "nativeName": "فارسی"},
+    "ff": {"name": "Fulah", "nativeName": "Fulfulde"},
+    "fi": {"name": "Finnish", "nativeName": "suomi"},
+    "fj": {"name": "Fijian", "nativeName": "vosa Vakaviti"},
+    "fo": {"name": "Faroese", "nativeName": "føroyskt"},
+    "fr": {"name": "French", "nativeName": "français"},
+    "fy": {"name": "Western Frisian", "nativeName": "Frysk"},
+    "ga": {"name": "Irish", "nativeName": "Gaeilge"},
+    "gd": {"name": "Scottish Gaelic", "nativeName": "Gàidhlig"},
+    "gl": {"name": "Galician", "nativeName": "galego"},
+    "gn": {"name": "Guarani", "nativeName": "Avañe'ẽ"},
+    "gu": {"name": "Gujarati", "nativeName": "ગુજરાતી"},
+    "gv": {"name": "Manx", "nativeName": "Gaelg"},
+    "ha": {"name": "Hausa", "nativeName": "Hausa"},
+    "he": {"name": "Hebrew", "nativeName": "עברית"},
+    "hi": {"name": "Hindi", "nativeName": "हिन्दी"},
+    "ho": {"name": "Hiri Motu", "nativeName": "Hiri Motu"},
+    "hr": {"name": "Croatian", "nativeName": "hrvatski"},
+    "ht": {"name": "Haitian", "nativeName": "Kreyòl ayisyen"},
+    "hu": {"name": "Hungarian", "nativeName": "magyar"},
+    "hy": {"name": "Armenian", "nativeName": "Հայերեն"},
+    "hz": {"name": "Herero", "nativeName": "Otjiherero"},
+    "ia": {"name": "Interlingua", "nativeName": "Interlingua"},
+    "id": {"name": "Indonesian", "nativeName": "Bahasa Indonesia"},
+    "ie": {"name": "Interlingue", "nativeName": "Interlingue"},
+    "ig": {"name": "Igbo", "nativeName": "Asụsụ Igbo"},
+    "ii": {"name": "Sichuan Yi", "nativeName": "ꆇꉙ"},
+    "ik": {"name": "Inupiaq", "nativeName": "Iñupiaq"},
+    "io": {"name": "Ido", "nativeName": "Ido"},
+    "is": {"name": "Icelandic", "nativeName": "íslenska"},
+    "it": {"name": "Italian", "nativeName": "italiano"},
+    "iu": {"name": "Inuktitut", "nativeName": "ᐃᓄᒃᑎᑐᑦ"},
+    "ja": {"name": "Japanese", "nativeName": "日本語"},
+    "jv": {"name": "Javanese", "nativeName": "ꦧꦱꦗꦮ"},
+    "ka": {"name": "Georgian", "nativeName": "ქართული"},
+    "kg": {"name": "Kongo", "nativeName": "Kikongo"},
+    "ki": {"name": "Kikuyu", "nativeName": "Gĩkũyũ"},
+    "kj": {"name": "Kuanyama", "nativeName": "Kuanyama"},
+    "kk": {"name": "Kazakh", "nativeName": "қазақ тілі"},
+    "kl": {"name": "Kalaallisut", "nativeName": "kalaallisut"},
+    "km": {"name": "Khmer", "nativeName": "ខ្មែរ"},
+    "kn": {"name": "Kannada", "nativeName": "ಕನ್ನಡ"},
+    "ko": {"name": "Korean", "nativeName": "한국어"},
+    "kr": {"name": "Kanuri", "nativeName": "Kanuri"},
+    "ks": {"name": "Kashmiri", "nativeName": "कश्मीरी / كشميري"},
+    "ku": {"name": "Kurdish", "nativeName": "Kurdî"},
+    "kv": {"name": "Komi", "nativeName": "коми кыв"},
+    "kw": {"name": "Cornish", "nativeName": "Kernewek"},
+    "ky": {"name": "Kirghiz", "nativeName": "кыргыз тили"},
+    "la": {"name": "Latin", "nativeName": "latine"},
+    "lb": {"name": "Luxembourgish", "nativeName": "Lëtzebuergesch"},
+    "lg": {"name": "Ganda", "nativeName": "Luganda"},
+    "li": {"name": "Limburgan", "nativeName": "Limburgs"},
+    "ln": {"name": "Lingala", "nativeName": "Lingála"},
+    "lo": {"name": "Lao", "nativeName": "ພາສາລາວ"},
+    "lt": {"name": "Lithuanian", "nativeName": "lietuvių kalba"},
+    "lu": {"name": "Luba-Katanga", "nativeName": "Tshiluba"},
+    "lv": {"name": "Latvian", "nativeName": "latviešu valoda"},
+    "mg": {"name": "Malagasy", "nativeName": "fiteny malagasy"},
+    "mh": {"name": "Marshallese", "nativeName": "Kajin M̧ajeļ"},
+    "mi": {"name": "Māori", "nativeName": "te reo Māori"},
+    "mk": {"name": "Macedonian", "nativeName": "македонски"},
+    "ml": {"name": "Malayalam", "nativeName": "മലയാളം"},
+    "mn": {"name": "Mongolian", "nativeName": "монгол"},
+    "mr": {"name": "Marathi", "nativeName": "मराठी"},
+    "ms": {"name": "Malay", "nativeName": "Bahasa Melayu"},
+    "mt": {"name": "Maltese", "nativeName": "Malti"},
+    "my": {"name": "Burmese", "nativeName": "ဗမာစာ"},
+    "na": {"name": "Nauru", "nativeName": "Dorerin Naoero"},
+    "nb": {"name": "Norwegian Bokmål", "nativeName": "Bokmål"},
+    "nd": {"name": "North Ndebele", "nativeName": "isiNdebele"},
+    "ne": {"name": "Nepali", "nativeName": "नेपाली"},
+    "ng": {"name": "Ndonga", "nativeName": "Owambo"},
+    "nl": {"name": "Dutch", "nativeName": "Nederlands"},
+    "nn": {"name": "Norwegian Nynorsk", "nativeName": "Nynorsk"},
+    "no": {"name": "Norwegian", "nativeName": "Norsk"},
+    "nr": {"name": "South Ndebele", "nativeName": "isiNdebele"},
+    "nv": {"name": "Navajo", "nativeName": "Diné bizaad"},
+    "ny": {"name": "Chichewa", "nativeName": "chiCheŵa"},
+    "oc": {"name": "Occitan", "nativeName": "occitan"},
+    "oj": {"name": "Ojibwa", "nativeName": "Anishinaabemowin"},
+    "om": {"name": "Oromo", "nativeName": "Afaan Oromoo"},
+    "or": {"name": "Oriya", "nativeName": "ଓଡ଼ିଆ"},
+    "os": {"name": "Ossetian", "nativeName": "ирон æвзаг"},
+    "pa": {"name": "Punjabi", "nativeName": "ਪੰਜਾਬੀ"},
+    "pi": {"name": "Pali", "nativeName": "पाऴि"},
+    "pl": {"name": "Polish", "nativeName": "polski"},
+    "ps": {"name": "Pashto", "nativeName": "پښتو"},
+    "pt": {"name": "Portuguese", "nativeName": "português"},
+    "qu": {"name": "Quechua", "nativeName": "Runa Simi"},
+    "rm": {"name": "Romansh", "nativeName": "rumantsch"},
+    "rn": {"name": "Rundi", "nativeName": "Ikirundi"},
+    "ro": {"name": "Romanian", "nativeName": "română"},
+    "ru": {"name": "Russian", "nativeName": "русский"},
+    "rw": {"name": "Kinyarwanda", "nativeName": "Ikinyarwanda"},
+    "sa": {"name": "Sanskrit", "nativeName": "संस्कृतम्"},
+    "sc": {"name": "Sardinian", "nativeName": "sardu"},
+    "sd": {"name": "Sindhi", "nativeName": "سنڌي"},
+    "se": {"name": "Northern Sami", "nativeName": "Davvisámegiella"},
+    "sg": {"name": "Sango", "nativeName": "yângâ tî sängö"},
+    "si": {"name": "Sinhala", "nativeName": "සිංහල"},
+    "sk": {"name": "Slovak", "nativeName": "slovenčina"},
+    "sl": {"name": "Slovenian", "nativeName": "slovenščina"},
+    "sm": {"name": "Samoan", "nativeName": "gagana fa'a Samoa"},
+    "sn": {"name": "Shona", "nativeName": "chiShona"},
+    "so": {"name": "Somali", "nativeName": "Soomaaliga"},
+    "sq": {"name": "Albanian", "nativeName": "Shqip"},
+    "sr": {"name": "Serbian", "nativeName": "српски"},
+    "ss": {"name": "Swati", "nativeName": "SiSwati"},
+    "st": {"name": "Southern Sotho", "nativeName": "Sesotho"},
+    "su": {"name": "Sundanese", "nativeName": "Basa Sunda"},
+    "sv": {"name": "Swedish", "nativeName": "svenska"},
+    "sw": {"name": "Swahili", "nativeName": "Kiswahili"},
+    "ta": {"name": "Tamil", "nativeName": "தமிழ்"},
+    "te": {"name": "Telugu", "nativeName": "తెలుగు"},
+    "tg": {"name": "Tajik", "nativeName": "тоҷикӣ"},
+    "th": {"name": "Thai", "nativeName": "ไทย"},
+    "ti": {"name": "Tigrinya", "nativeName": "ትግርኛ"},
+    "tk": {"name": "Turkmen", "nativeName": "türkmen dili"},
+    "tl": {"name": "Tagalog", "nativeName": "Wikang Tagalog"},
+    "tn": {"name": "Tswana", "nativeName": "Setswana"},
+    "to": {"name": "Tonga", "nativeName": "faka Tonga"},
+    "tr": {"name": "Turkish", "nativeName": "Türkçe"},
+    "ts": {"name": "Tsonga", "nativeName": "Xitsonga"},
+    "tt": {"name": "Tatar", "nativeName": "татарча"},
+    "tw": {"name": "Twi", "nativeName": "Twi"},
+    "ty": {"name": "Tahitian", "nativeName": "Reo Tahiti"},
+    "ug": {"name": "Uighur", "nativeName": "ئۇيغۇرچە"},
+    "uk": {"name": "Ukrainian", "nativeName": "українська"},
+    "ur": {"name": "Urdu", "nativeName": "اردو"},
+    "uz": {"name": "Uzbek", "nativeName": "o‘zbek tili"},
+    "ve": {"name": "Venda", "nativeName": "Tshivenda"},
+    "vi": {"name": "Vietnamese", "nativeName": "Tiếng Việt"},
+    "vo": {"name": "Volapük", "nativeName": "Volapük"},
+    "wa": {"name": "Walloon", "nativeName": "walon"},
+    "wo": {"name": "Wolof", "nativeName": "Wollof"},
+    "xh": {"name": "Xhosa", "nativeName": "isiXhosa"},
+    "yi": {"name": "Yiddish", "nativeName": "ייִדיש"},
+    "yo": {"name": "Yoruba", "nativeName": "Yorùbá"},
+    "za": {"name": "Zhuang", "nativeName": "Saɯ cueŋƅ"},
+    "zh": {"name": "Chinese", "nativeName": "中文"},
+    "zu": {"name": "Zulu", "nativeName": "isiZulu"},
+}
+
diff --git a/youtubevideotranscriptbot/requirements.txt b/youtubevideotranscriptbot/requirements.txt
@@ -11,5 +11,6 @@ aiofiles>=0.4.0
 amplitude-analytics==1.1.5
 supadata==1.2.1
 python-dotenv
+weasyprint>=60.2
 
 
diff --git a/youtubevideotranscriptbot/run_bot.sh b/youtubevideotranscriptbot/run_bot.sh
@@ -24,4 +24,4 @@ docker run -d --name ${CONTAINER_NAME} \
   --restart unless-stopped \
   ${IMAGE_TAG}
 
-echo "Container started successfully!"
+echo "Container started successfully!"
diff --git a/youtubevideotranscriptbot/summarize.py b/youtubevideotranscriptbot/summarize.py
@@ -47,7 +47,7 @@ def _summarize_sync(chunk, language, model=MODEL_TO_USE):
         system_role = "You are a master of extracting pearls of knowledge from YouTube video transcripts. You grasp the very essence and distill it in a concise form for users. You always provide the response in the same language in which the transcript is provided. Your answers are always clear, concise and nicely formatted."
         prompt = (
             f"If I did not have time to read this YouTube video transcript, what are the most important things I absolutely must know. Enlighten me in no more than 200 words. "
-            f"Always provide your response in the same language as the transcript. In this case it might be in '{language}' language. Here is the transcript itself:\n\n{chunk}"
+            f"Always provide your response in the same language as the transcript. In this case it might be in '{language}' language. Use emojis to structure the summary if and when appropriate. Never reference this prompt. Get straight to the point without intros. Here is the transcript itself:\n\n{chunk}"
         )
         # prompt = (
         #     f"If I did not have time to read this YouTube video transcript, what are the most important things I absolutely must know. Enlighten me in no more than 200 words. "
diff --git a/youtubevideotranscriptbot/telegram_bot.py b/youtubevideotranscriptbot/telegram_bot.py
diff --git a/youtubevideotranscriptbot/utils.py b/youtubevideotranscriptbot/utils.py

Original file line number	Diff line number	Diff line change
`@@ -47,7 +47,7 @@ def _summarize_sync(chunk, language, model=MODEL_TO_USE):`
`47`	`47`	`system_role = "You are a master of extracting pearls of knowledge from YouTube video transcripts. You grasp the very essence and distill it in a concise form for users. You always provide the response in the same language in which the transcript is provided. Your answers are always clear, concise and nicely formatted."`
`48`	`48`	`prompt = (`
`49`	`49`	`f"If I did not have time to read this YouTube video transcript, what are the most important things I absolutely must know. Enlighten me in no more than 200 words. "`
`50`		`- f"Always provide your response in the same language as the transcript. In this case it might be in '{language}' language. Here is the transcript itself:\n\n{chunk}"`
	`50`	`+ f"Always provide your response in the same language as the transcript. In this case it might be in '{language}' language. Use emojis to structure the summary if and when appropriate. Never reference this prompt. Get straight to the point without intros. Here is the transcript itself:\n\n{chunk}"`
`51`	`51`	`)`
`52`	`52`	`# prompt = (`
`53`	`53`	`# f"If I did not have time to read this YouTube video transcript, what are the most important things I absolutely must know. Enlighten me in no more than 200 words. "`