Skip to content

Commit deff907

Browse files
committed
Add PDF generation for long video descriptions
1 parent d96edbf commit deff907

7 files changed

Lines changed: 515 additions & 15 deletions

File tree

youtubevideotranscriptbot/database.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,15 +68,16 @@ def store_video(video_id, video_details, subscribers):
6868
try:
6969
with db_cursor() as cursor:
7070
query = """
71-
INSERT INTO videos (video_id, video_title, channel_name, subscribers, view_count, like_count, comment_count, description, video_link)
72-
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
71+
INSERT INTO videos (video_id, video_title, channel_name, subscribers, view_count, like_count, comment_count, duration, description, video_link)
72+
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
7373
ON DUPLICATE KEY UPDATE
7474
video_title = VALUES(video_title),
7575
channel_name = VALUES(channel_name),
7676
subscribers = VALUES(subscribers),
7777
view_count = VALUES(view_count),
7878
like_count = VALUES(like_count),
7979
comment_count = VALUES(comment_count),
80+
duration = VALUES(duration),
8081
description = VALUES(description),
8182
video_link = VALUES(video_link)
8283
"""
@@ -89,6 +90,7 @@ def store_video(video_id, video_details, subscribers):
8990
video_details['statistics'].get('viewCount', 'N/A'),
9091
video_details['statistics'].get('likeCount', 'N/A'),
9192
video_details['statistics'].get('commentCount', 'N/A'),
93+
video_details.get('contentDetails', {}).get('duration', 'PT0S'),
9294
video_details['snippet'].get('description', ''),
9395
f"https://www.youtube.com/watch?v={video_id}"
9496
))
@@ -284,4 +286,33 @@ def get_existing_transcripts(video_id, normalized_language_code=None):
284286

285287
async def get_existing_transcripts_async(video_id, normalized_language_code=None):
    """
    Asynchronous wrapper around get_existing_transcripts.

    Runs the synchronous get_existing_transcripts call in the event loop's
    default executor so the lookup does not block the loop.

    Args:
        video_id: Passed through unchanged to get_existing_transcripts.
        normalized_language_code: Passed through unchanged; defaults to None.

    Returns:
        Whatever get_existing_transcripts returns for the same arguments.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # preferred way to obtain it and never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, get_existing_transcripts, video_id, normalized_language_code
    )
290+
291+
292+
async def get_video_by_id_async(video_id):
    """
    Asynchronously fetches a video object from the videos table by video_id.

    The blocking query runs in the event loop's default executor.

    Args:
        video_id: Value matched against the video_id column of the videos table.

    Returns:
        A dict with video details keyed by column name if a row is found,
        else None. None is also returned when the query raises; the failure
        is logged rather than propagated.
    """
    # Must stay in the same order as the SELECT column list below, because
    # the row values are paired with these names positionally.
    keys = (
        "video_id",
        "video_title",
        "channel_name",
        "subscribers",
        "view_count",
        "like_count",
        "comment_count",
        "duration",
        "description",
        "video_link",
    )

    def get_video_by_id(video_id):
        try:
            with db_cursor() as cursor:
                query = """
                    SELECT video_id, video_title, channel_name, subscribers, view_count, like_count, comment_count, duration, description, video_link
                    FROM videos
                    WHERE video_id = %s
                    LIMIT 1
                """
                cursor.execute(query, (video_id,))
                row = cursor.fetchone()
                # NOTE(review): assumes the cursor yields positional rows
                # (tuple-like), not dict rows -- confirm against db_cursor.
                return dict(zip(keys, row)) if row else None
        except Exception as e:
            # Best-effort lookup: report the failure and signal "not found".
            logger.error(f"Failed to fetch video for video_id={video_id}: {e}")
            return None

    # Inside a coroutine a loop is always running; get_running_loop() is the
    # preferred way to obtain it and never creates a new loop implicitly.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, get_video_by_id, video_id)
Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
# languages.py
2+
# This file contains a dictionary of language codes and their corresponding names and native names.
3+
4+
languages = {
5+
"aa": {"name": "Afar", "nativeName": "Qafar af"},
6+
"ab": {"name": "Abkhazian", "nativeName": "аҧсуа"},
7+
"ae": {"name": "Avestan", "nativeName": "avesta"},
8+
"af": {"name": "Afrikaans", "nativeName": "Afrikaans"},
9+
"ak": {"name": "Akan", "nativeName": "Akan"},
10+
"am": {"name": "Amharic", "nativeName": "አማርኛ"},
11+
"an": {"name": "Aragonese", "nativeName": "aragonés"},
12+
"ar": {"name": "Arabic", "nativeName": "العربية"},
13+
"as": {"name": "Assamese", "nativeName": "অসমীয়া"},
14+
"av": {"name": "Avaric", "nativeName": "авар мацӀ"},
15+
"ay": {"name": "Aymara", "nativeName": "aymar aru"},
16+
"az": {"name": "Azerbaijani", "nativeName": "azərbaycan dili"},
17+
"ba": {"name": "Bashkir", "nativeName": "башҡорт теле"},
18+
"be": {"name": "Belarusian", "nativeName": "беларуская"},
19+
"bg": {"name": "Bulgarian", "nativeName": "български"},
20+
"bh": {"name": "Bihari", "nativeName": "भोजपुरी"},
21+
"bi": {"name": "Bislama", "nativeName": "Bislama"},
22+
"bm": {"name": "Bambara", "nativeName": "bamanankan"},
23+
"bn": {"name": "Bengali", "nativeName": "বাংলা"},
24+
"bo": {"name": "Tibetan", "nativeName": "བོད་ཡིག"},
25+
"br": {"name": "Breton", "nativeName": "brezhoneg"},
26+
"bs": {"name": "Bosnian", "nativeName": "bosanski"},
27+
"ca": {"name": "Catalan", "nativeName": "català"},
28+
"ce": {"name": "Chechen", "nativeName": "нохчийн"},
29+
"ch": {"name": "Chamorro", "nativeName": "Chamoru"},
30+
"co": {"name": "Corsican", "nativeName": "corsu"},
31+
"cr": {"name": "Cree", "nativeName": "ᓀᐦᐃᔭᐍᐏᐣ"},
32+
"cs": {"name": "Czech", "nativeName": "čeština"},
33+
"cu": {"name": "Church Slavic", "nativeName": "церковнославянский"},
34+
"cv": {"name": "Chuvash", "nativeName": "чӑваш чӗлхи"},
35+
"cy": {"name": "Welsh", "nativeName": "Cymraeg"},
36+
"da": {"name": "Danish", "nativeName": "dansk"},
37+
"de": {"name": "German", "nativeName": "Deutsch"},
38+
"dv": {"name": "Divehi", "nativeName": "ދިވެހި"},
39+
"dz": {"name": "Dzongkha", "nativeName": "རྫོང་ཁ"},
40+
"ee": {"name": "Ewe", "nativeName": "Eʋegbe"},
41+
"el": {"name": "Greek", "nativeName": "Ελληνικά"},
42+
"en": {"name": "English", "nativeName": "English"},
43+
"eo": {"name": "Esperanto", "nativeName": "Esperanto"},
44+
"es": {"name": "Spanish", "nativeName": "español"},
45+
"et": {"name": "Estonian", "nativeName": "eesti"},
46+
"eu": {"name": "Basque", "nativeName": "euskara"},
47+
"fa": {"name": "Persian", "nativeName": "فارسی"},
48+
"ff": {"name": "Fulah", "nativeName": "Fulfulde"},
49+
"fi": {"name": "Finnish", "nativeName": "suomi"},
50+
"fj": {"name": "Fijian", "nativeName": "vosa Vakaviti"},
51+
"fo": {"name": "Faroese", "nativeName": "føroyskt"},
52+
"fr": {"name": "French", "nativeName": "français"},
53+
"fy": {"name": "Western Frisian", "nativeName": "Frysk"},
54+
"ga": {"name": "Irish", "nativeName": "Gaeilge"},
55+
"gd": {"name": "Scottish Gaelic", "nativeName": "Gàidhlig"},
56+
"gl": {"name": "Galician", "nativeName": "galego"},
57+
"gn": {"name": "Guarani", "nativeName": "Avañe'ẽ"},
58+
"gu": {"name": "Gujarati", "nativeName": "ગુજરાતી"},
59+
"gv": {"name": "Manx", "nativeName": "Gaelg"},
60+
"ha": {"name": "Hausa", "nativeName": "Hausa"},
61+
"he": {"name": "Hebrew", "nativeName": "עברית"},
62+
"hi": {"name": "Hindi", "nativeName": "हिन्दी"},
63+
"ho": {"name": "Hiri Motu", "nativeName": "Hiri Motu"},
64+
"hr": {"name": "Croatian", "nativeName": "hrvatski"},
65+
"ht": {"name": "Haitian", "nativeName": "Kreyòl ayisyen"},
66+
"hu": {"name": "Hungarian", "nativeName": "magyar"},
67+
"hy": {"name": "Armenian", "nativeName": "Հայերեն"},
68+
"hz": {"name": "Herero", "nativeName": "Otjiherero"},
69+
"ia": {"name": "Interlingua", "nativeName": "Interlingua"},
70+
"id": {"name": "Indonesian", "nativeName": "Bahasa Indonesia"},
71+
"ie": {"name": "Interlingue", "nativeName": "Interlingue"},
72+
"ig": {"name": "Igbo", "nativeName": "Asụsụ Igbo"},
73+
"ii": {"name": "Sichuan Yi", "nativeName": "ꆇꉙ"},
74+
"ik": {"name": "Inupiaq", "nativeName": "Iñupiaq"},
75+
"io": {"name": "Ido", "nativeName": "Ido"},
76+
"is": {"name": "Icelandic", "nativeName": "íslenska"},
77+
"it": {"name": "Italian", "nativeName": "italiano"},
78+
"iu": {"name": "Inuktitut", "nativeName": "ᐃᓄᒃᑎᑐᑦ"},
79+
"ja": {"name": "Japanese", "nativeName": "日本語"},
80+
"jv": {"name": "Javanese", "nativeName": "ꦧꦱꦗꦮ"},
81+
"ka": {"name": "Georgian", "nativeName": "ქართული"},
82+
"kg": {"name": "Kongo", "nativeName": "Kikongo"},
83+
"ki": {"name": "Kikuyu", "nativeName": "Gĩkũyũ"},
84+
"kj": {"name": "Kuanyama", "nativeName": "Kuanyama"},
85+
"kk": {"name": "Kazakh", "nativeName": "қазақ тілі"},
86+
"kl": {"name": "Kalaallisut", "nativeName": "kalaallisut"},
87+
"km": {"name": "Khmer", "nativeName": "ខ្មែរ"},
88+
"kn": {"name": "Kannada", "nativeName": "ಕನ್ನಡ"},
89+
"ko": {"name": "Korean", "nativeName": "한국어"},
90+
"kr": {"name": "Kanuri", "nativeName": "Kanuri"},
91+
"ks": {"name": "Kashmiri", "nativeName": "कश्मीरी / كشميري"},
92+
"ku": {"name": "Kurdish", "nativeName": "Kurdî"},
93+
"kv": {"name": "Komi", "nativeName": "коми кыв"},
94+
"kw": {"name": "Cornish", "nativeName": "Kernewek"},
95+
"ky": {"name": "Kirghiz", "nativeName": "кыргыз тили"},
96+
"la": {"name": "Latin", "nativeName": "latine"},
97+
"lb": {"name": "Luxembourgish", "nativeName": "Lëtzebuergesch"},
98+
"lg": {"name": "Ganda", "nativeName": "Luganda"},
99+
"li": {"name": "Limburgan", "nativeName": "Limburgs"},
100+
"ln": {"name": "Lingala", "nativeName": "Lingála"},
101+
"lo": {"name": "Lao", "nativeName": "ພາສາລາວ"},
102+
"lt": {"name": "Lithuanian", "nativeName": "lietuvių kalba"},
103+
"lu": {"name": "Luba-Katanga", "nativeName": "Tshiluba"},
104+
"lv": {"name": "Latvian", "nativeName": "latviešu valoda"},
105+
"mg": {"name": "Malagasy", "nativeName": "fiteny malagasy"},
106+
"mh": {"name": "Marshallese", "nativeName": "Kajin M̧ajeļ"},
107+
"mi": {"name": "Māori", "nativeName": "te reo Māori"},
108+
"mk": {"name": "Macedonian", "nativeName": "македонски"},
109+
"ml": {"name": "Malayalam", "nativeName": "മലയാളം"},
110+
"mn": {"name": "Mongolian", "nativeName": "монгол"},
111+
"mr": {"name": "Marathi", "nativeName": "मराठी"},
112+
"ms": {"name": "Malay", "nativeName": "Bahasa Melayu"},
113+
"mt": {"name": "Maltese", "nativeName": "Malti"},
114+
"my": {"name": "Burmese", "nativeName": "ဗမာစာ"},
115+
"na": {"name": "Nauru", "nativeName": "Dorerin Naoero"},
116+
"nb": {"name": "Norwegian Bokmål", "nativeName": "Bokmål"},
117+
"nd": {"name": "North Ndebele", "nativeName": "isiNdebele"},
118+
"ne": {"name": "Nepali", "nativeName": "नेपाली"},
119+
"ng": {"name": "Ndonga", "nativeName": "Owambo"},
120+
"nl": {"name": "Dutch", "nativeName": "Nederlands"},
121+
"nn": {"name": "Norwegian Nynorsk", "nativeName": "Nynorsk"},
122+
"no": {"name": "Norwegian", "nativeName": "Norsk"},
123+
"nr": {"name": "South Ndebele", "nativeName": "isiNdebele"},
124+
"nv": {"name": "Navajo", "nativeName": "Diné bizaad"},
125+
"ny": {"name": "Chichewa", "nativeName": "chiCheŵa"},
126+
"oc": {"name": "Occitan", "nativeName": "occitan"},
127+
"oj": {"name": "Ojibwa", "nativeName": "Anishinaabemowin"},
128+
"om": {"name": "Oromo", "nativeName": "Afaan Oromoo"},
129+
"or": {"name": "Oriya", "nativeName": "ଓଡ଼ିଆ"},
130+
"os": {"name": "Ossetian", "nativeName": "ирон æвзаг"},
131+
"pa": {"name": "Punjabi", "nativeName": "ਪੰਜਾਬੀ"},
132+
"pi": {"name": "Pali", "nativeName": "पाऴि"},
133+
"pl": {"name": "Polish", "nativeName": "polski"},
134+
"ps": {"name": "Pashto", "nativeName": "پښتو"},
135+
"pt": {"name": "Portuguese", "nativeName": "português"},
136+
"qu": {"name": "Quechua", "nativeName": "Runa Simi"},
137+
"rm": {"name": "Romansh", "nativeName": "rumantsch"},
138+
"rn": {"name": "Rundi", "nativeName": "Ikirundi"},
139+
"ro": {"name": "Romanian", "nativeName": "română"},
140+
"ru": {"name": "Russian", "nativeName": "русский"},
141+
"rw": {"name": "Kinyarwanda", "nativeName": "Ikinyarwanda"},
142+
"sa": {"name": "Sanskrit", "nativeName": "संस्कृतम्"},
143+
"sc": {"name": "Sardinian", "nativeName": "sardu"},
144+
"sd": {"name": "Sindhi", "nativeName": "سنڌي"},
145+
"se": {"name": "Northern Sami", "nativeName": "Davvisámegiella"},
146+
"sg": {"name": "Sango", "nativeName": "yângâ tî sängö"},
147+
"si": {"name": "Sinhala", "nativeName": "සිංහල"},
148+
"sk": {"name": "Slovak", "nativeName": "slovenčina"},
149+
"sl": {"name": "Slovenian", "nativeName": "slovenščina"},
150+
"sm": {"name": "Samoan", "nativeName": "gagana fa'a Samoa"},
151+
"sn": {"name": "Shona", "nativeName": "chiShona"},
152+
"so": {"name": "Somali", "nativeName": "Soomaaliga"},
153+
"sq": {"name": "Albanian", "nativeName": "Shqip"},
154+
"sr": {"name": "Serbian", "nativeName": "српски"},
155+
"ss": {"name": "Swati", "nativeName": "SiSwati"},
156+
"st": {"name": "Southern Sotho", "nativeName": "Sesotho"},
157+
"su": {"name": "Sundanese", "nativeName": "Basa Sunda"},
158+
"sv": {"name": "Swedish", "nativeName": "svenska"},
159+
"sw": {"name": "Swahili", "nativeName": "Kiswahili"},
160+
"ta": {"name": "Tamil", "nativeName": "தமிழ்"},
161+
"te": {"name": "Telugu", "nativeName": "తెలుగు"},
162+
"tg": {"name": "Tajik", "nativeName": "тоҷикӣ"},
163+
"th": {"name": "Thai", "nativeName": "ไทย"},
164+
"ti": {"name": "Tigrinya", "nativeName": "ትግርኛ"},
165+
"tk": {"name": "Turkmen", "nativeName": "türkmen dili"},
166+
"tl": {"name": "Tagalog", "nativeName": "Wikang Tagalog"},
167+
"tn": {"name": "Tswana", "nativeName": "Setswana"},
168+
"to": {"name": "Tonga", "nativeName": "faka Tonga"},
169+
"tr": {"name": "Turkish", "nativeName": "Türkçe"},
170+
"ts": {"name": "Tsonga", "nativeName": "Xitsonga"},
171+
"tt": {"name": "Tatar", "nativeName": "татарча"},
172+
"tw": {"name": "Twi", "nativeName": "Twi"},
173+
"ty": {"name": "Tahitian", "nativeName": "Reo Tahiti"},
174+
"ug": {"name": "Uighur", "nativeName": "ئۇيغۇرچە"},
175+
"uk": {"name": "Ukrainian", "nativeName": "українська"},
176+
"ur": {"name": "Urdu", "nativeName": "اردو"},
177+
"uz": {"name": "Uzbek", "nativeName": "o‘zbek tili"},
178+
"ve": {"name": "Venda", "nativeName": "Tshivenda"},
179+
"vi": {"name": "Vietnamese", "nativeName": "Tiếng Việt"},
180+
"vo": {"name": "Volapük", "nativeName": "Volapük"},
181+
"wa": {"name": "Walloon", "nativeName": "walon"},
182+
"wo": {"name": "Wolof", "nativeName": "Wollof"},
183+
"xh": {"name": "Xhosa", "nativeName": "isiXhosa"},
184+
"yi": {"name": "Yiddish", "nativeName": "ייִדיש"},
185+
"yo": {"name": "Yoruba", "nativeName": "Yorùbá"},
186+
"za": {"name": "Zhuang", "nativeName": "Saɯ cueŋƅ"},
187+
"zh": {"name": "Chinese", "nativeName": "中文"},
188+
"zu": {"name": "Zulu", "nativeName": "isiZulu"},
189+
}
190+

youtubevideotranscriptbot/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,6 @@ aiofiles>=0.4.0
1111
amplitude-analytics==1.1.5
1212
supadata==1.2.1
1313
python-dotenv
14+
weasyprint>=60.2
1415

1516

youtubevideotranscriptbot/run_bot.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@ docker run -d --name ${CONTAINER_NAME} \
2424
--restart unless-stopped \
2525
${IMAGE_TAG}
2626

27-
echo "Container started successfully!"
27+
echo "Container started successfully!"

youtubevideotranscriptbot/summarize.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def _summarize_sync(chunk, language, model=MODEL_TO_USE):
4747
system_role = "You are a master of extracting pearls of knowledge from YouTube video transcripts. You grasp the very essence and distill it in a concise form for users. You always provide the response in the same language in which the transcript is provided. Your answers are always clear, concise and nicely formatted."
4848
prompt = (
4949
f"If I did not have time to read this YouTube video transcript, what are the most important things I absolutely must know. Enlighten me in no more than 200 words. "
50-
f"Always provide your response in the same language as the transcript. In this case it might be in '{language}' language. Here is the transcript itself:\n\n{chunk}"
50+
f"Always provide your response in the same language as the transcript. In this case it might be in '{language}' language. Use emojis to structure the summary if and when appropriate. Never reference this prompt. Get straight to the point without intros. Here is the transcript itself:\n\n{chunk}"
5151
)
5252
# prompt = (
5353
# f"If I did not have time to read this YouTube video transcript, what are the most important things I absolutely must know. Enlighten me in no more than 200 words. "

0 commit comments

Comments
 (0)