You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

133 lines
5.5 KiB

  1. import json
  2. import netrc
  3. import re
  4. import socket
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. compat_http_client,
  8. compat_str,
  9. compat_urllib_error,
  10. compat_urllib_parse,
  11. compat_urllib_request,
  12. ExtractorError,
  13. )
  14. class FacebookIE(InfoExtractor):
  15. """Information Extractor for Facebook"""
  16. _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
  17. _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
  18. _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
  19. _NETRC_MACHINE = 'facebook'
  20. IE_NAME = u'facebook'
  21. _TEST = {
  22. u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
  23. u'file': u'120708114770723.mp4',
  24. u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
  25. u'info_dict': {
  26. u"duration": 279,
  27. u"title": u"PEOPLE ARE AWESOME 2013"
  28. }
  29. }
  30. def report_login(self):
  31. """Report attempt to log in."""
  32. self.to_screen(u'Logging in')
  33. def _login(self):
  34. (useremail, password) = self._get_login_info()
  35. if useremail is None:
  36. return
  37. login_page_req = compat_urllib_request.Request(self._LOGIN_URL)
  38. login_page_req.add_header('Cookie', 'locale=en_US')
  39. self.report_login()
  40. login_page = self._download_webpage(login_page_req, None, note=False,
  41. errnote=u'Unable to download login page')
  42. lsd = self._search_regex(r'"lsd":"(\w*?)"', login_page, u'lsd')
  43. lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, u'lgnrnd')
  44. login_form = {
  45. 'email': useremail,
  46. 'pass': password,
  47. 'lsd': lsd,
  48. 'lgnrnd': lgnrnd,
  49. 'next': 'http://facebook.com/home.php',
  50. 'default_persistent': '0',
  51. 'legacy_return': '1',
  52. 'timezone': '-60',
  53. 'trynum': '1',
  54. }
  55. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
  56. request.add_header('Content-Type', 'application/x-www-form-urlencoded')
  57. try:
  58. login_results = compat_urllib_request.urlopen(request).read()
  59. if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
  60. self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
  61. return
  62. check_form = {
  63. 'fb_dtsg': self._search_regex(r'"fb_dtsg":"(.*?)"', login_results, u'fb_dtsg'),
  64. 'nh': self._search_regex(r'name="nh" value="(\w*?)"', login_results, u'nh'),
  65. 'name_action_selected': 'dont_save',
  66. 'submit[Continue]': self._search_regex(r'<input value="(.*?)" name="submit\[Continue\]"', login_results, u'continue'),
  67. }
  68. check_req = compat_urllib_request.Request(self._CHECKPOINT_URL, compat_urllib_parse.urlencode(check_form))
  69. check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  70. check_response = compat_urllib_request.urlopen(check_req).read()
  71. if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
  72. self._downloader.report_warning(u'Unable to confirm login, you have to login in your brower and authorize the login.')
  73. except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  74. self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
  75. return
  76. def _real_initialize(self):
  77. self._login()
  78. def _real_extract(self, url):
  79. mobj = re.match(self._VALID_URL, url)
  80. if mobj is None:
  81. raise ExtractorError(u'Invalid URL: %s' % url)
  82. video_id = mobj.group('ID')
  83. url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
  84. webpage = self._download_webpage(url, video_id)
  85. BEFORE = '{swf.addParam(param[0], param[1]);});\n'
  86. AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
  87. m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
  88. if not m:
  89. m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
  90. if m_msg is not None:
  91. raise ExtractorError(
  92. u'The video is not available, Facebook said: "%s"' % m_msg.group(1),
  93. expected=True)
  94. else:
  95. raise ExtractorError(u'Cannot parse data')
  96. data = dict(json.loads(m.group(1)))
  97. params_raw = compat_urllib_parse.unquote(data['params'])
  98. params = json.loads(params_raw)
  99. video_data = params['video_data'][0]
  100. video_url = video_data.get('hd_src')
  101. if not video_url:
  102. video_url = video_data['sd_src']
  103. if not video_url:
  104. raise ExtractorError(u'Cannot find video URL')
  105. video_duration = int(video_data['video_duration'])
  106. thumbnail = video_data['thumbnail_src']
  107. video_title = self._html_search_regex(
  108. r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
  109. info = {
  110. 'id': video_id,
  111. 'title': video_title,
  112. 'url': video_url,
  113. 'ext': 'mp4',
  114. 'duration': video_duration,
  115. 'thumbnail': thumbnail,
  116. }
  117. return [info]