You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

86 lines
3.0 KiB

  1. import datetime
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. )
  7. class GooglePlusIE(InfoExtractor):
  8. """Information extractor for plus.google.com."""
  9. _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
  10. IE_NAME = u'plus.google'
  11. def _real_extract(self, url):
  12. # Extract id from URL
  13. mobj = re.match(self._VALID_URL, url)
  14. if mobj is None:
  15. raise ExtractorError(u'Invalid URL: %s' % url)
  16. post_url = mobj.group(0)
  17. video_id = mobj.group(1)
  18. video_extension = 'flv'
  19. # Step 1, Retrieve post webpage to extract further information
  20. webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
  21. self.report_extraction(video_id)
  22. # Extract update date
  23. upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
  24. webpage, u'upload date', fatal=False)
  25. if upload_date:
  26. # Convert timestring to a format suitable for filename
  27. upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
  28. upload_date = upload_date.strftime('%Y%m%d')
  29. # Extract uploader
  30. uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
  31. webpage, u'uploader', fatal=False)
  32. # Extract title
  33. # Get the first line for title
  34. video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
  35. webpage, 'title', default=u'NA')
  36. # Step 2, Simulate clicking the image box to launch video
  37. DOMAIN = 'https://plus.google.com'
  38. video_page = self._search_regex(r'<a href="((?:%s)?/photos/.*?)"' % re.escape(DOMAIN),
  39. webpage, u'video page URL')
  40. if not video_page.startswith(DOMAIN):
  41. video_page = DOMAIN + video_page
  42. webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
  43. # Extract video links on video page
  44. """Extract video links of all sizes"""
  45. pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
  46. mobj = re.findall(pattern, webpage)
  47. if len(mobj) == 0:
  48. raise ExtractorError(u'Unable to extract video links')
  49. # Sort in resolution
  50. links = sorted(mobj)
  51. # Choose the lowest of the sort, i.e. highest resolution
  52. video_url = links[-1]
  53. # Only get the url. The resolution part in the tuple has no use anymore
  54. video_url = video_url[-1]
  55. # Treat escaped \u0026 style hex
  56. try:
  57. video_url = video_url.decode("unicode_escape")
  58. except AttributeError: # Python 3
  59. video_url = bytes(video_url, 'ascii').decode('unicode-escape')
  60. return [{
  61. 'id': video_id,
  62. 'url': video_url,
  63. 'uploader': uploader,
  64. 'upload_date': upload_date,
  65. 'title': video_title,
  66. 'ext': video_extension,
  67. }]