You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

58 lines
2.1 KiB

11 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import unified_strdate
  6. class ElPaisIE(InfoExtractor):
  7. _VALID_URL = r'https?://(?:[^.]+\.)?elpais\.com/.*/(?P<id>[^/#?]+)\.html(?:$|[?#])'
  8. IE_DESC = 'El País'
  9. _TEST = {
  10. 'url': 'http://blogs.elpais.com/la-voz-de-inaki/2014/02/tiempo-nuevo-recetas-viejas.html',
  11. 'md5': '98406f301f19562170ec071b83433d55',
  12. 'info_dict': {
  13. 'id': 'tiempo-nuevo-recetas-viejas',
  14. 'ext': 'mp4',
  15. 'title': 'Tiempo nuevo, recetas viejas',
  16. 'description': 'De lunes a viernes, a partir de las ocho de la mañana, Iñaki Gabilondo nos cuenta su visión de la actualidad nacional e internacional.',
  17. 'upload_date': '20140206',
  18. }
  19. }
  20. def _real_extract(self, url):
  21. mobj = re.match(self._VALID_URL, url)
  22. video_id = mobj.group('id')
  23. webpage = self._download_webpage(url, video_id)
  24. prefix = self._html_search_regex(
  25. r'var url_cache = "([^"]+)";', webpage, 'URL prefix')
  26. video_suffix = self._search_regex(
  27. r"URLMediaFile = url_cache \+ '([^']+)'", webpage, 'video URL')
  28. video_url = prefix + video_suffix
  29. thumbnail_suffix = self._search_regex(
  30. r"URLMediaStill = url_cache \+ '([^']+)'", webpage, 'thumbnail URL',
  31. fatal=False)
  32. thumbnail = (
  33. None if thumbnail_suffix is None
  34. else prefix + thumbnail_suffix)
  35. title = self._html_search_regex(
  36. '<h2 class="entry-header entry-title.*?>(.*?)</h2>',
  37. webpage, 'title')
  38. date_str = self._search_regex(
  39. r'<p class="date-header date-int updated"\s+title="([^"]+)">',
  40. webpage, 'upload date', fatal=False)
  41. upload_date = (None if date_str is None else unified_strdate(date_str))
  42. return {
  43. 'id': video_id,
  44. 'url': video_url,
  45. 'title': title,
  46. 'description': self._og_search_description(webpage),
  47. 'thumbnail': thumbnail,
  48. 'upload_date': upload_date,
  49. }