Browse Source

[youtube] Improve tags extraction and add test

totalwebcasting
Sergey M․ 10 years ago
parent
commit
000b6b5ae5
1 changed file with 10 additions and 4 deletions
  1. +10
    -4
      youtube_dl/extractor/youtube.py

+ 10
- 4
youtube_dl/extractor/youtube.py View File

@@ -329,6 +329,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'upload_date': '20121002', 'upload_date': '20121002',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'], 'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
'like_count': int, 'like_count': int,
'dislike_count': int, 'dislike_count': int,
'start_time': 1, 'start_time': 1,
@@ -343,7 +344,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'upload_date': '20120506', 'upload_date': '20120506',
'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
'description': 'md5:782e8651347686cba06e58f71ab51773',
'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
'iconic ep', 'iconic', 'love', 'it'],
'uploader': 'Icona Pop', 'uploader': 'Icona Pop',
'uploader_id': 'IconaPop', 'uploader_id': 'IconaPop',
} }
@@ -1072,8 +1076,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else: else:
video_categories = None video_categories = None
video_tags = re.findall(r'''<meta(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?\s+property=['"]?og:video:tag['"]?(?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?\s+content=['"]?([^>'"]+?)['"]?\s*>'''
, video_webpage, re.DOTALL | re.IGNORECASE);
video_tags = [
unescapeHTML(m.group('content'))
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
# description # description
video_description = get_element_by_id("eow-description", video_webpage) video_description = get_element_by_id("eow-description", video_webpage)
if video_description: if video_description:
@@ -1261,8 +1267,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': video_title, 'title': video_title,
'thumbnail': video_thumbnail, 'thumbnail': video_thumbnail,
'description': video_description, 'description': video_description,
'tags' : video_tags,
'categories': video_categories, 'categories': video_categories,
'tags': video_tags,
'subtitles': video_subtitles, 'subtitles': video_subtitles,
'automatic_captions': automatic_captions, 'automatic_captions': automatic_captions,
'duration': video_duration, 'duration': video_duration,


Loading…
Cancel
Save