You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

120 lines
4.3 KiB

  1. import json
  2. import netrc
  3. import re
  4. import socket
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. compat_http_client,
  8. compat_str,
  9. compat_urllib_error,
  10. compat_urllib_parse,
  11. compat_urllib_request,
  12. ExtractorError,
  13. )
  14. class FacebookIE(InfoExtractor):
  15. """Information Extractor for Facebook"""
  16. _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
  17. _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
  18. _NETRC_MACHINE = 'facebook'
  19. IE_NAME = u'facebook'
  20. _TEST = {
  21. u'url': u'https://www.facebook.com/photo.php?v=120708114770723',
  22. u'file': u'120708114770723.mp4',
  23. u'md5': u'48975a41ccc4b7a581abd68651c1a5a8',
  24. u'info_dict': {
  25. u"duration": 279,
  26. u"title": u"PEOPLE ARE AWESOME 2013"
  27. }
  28. }
  29. def report_login(self):
  30. """Report attempt to log in."""
  31. self.to_screen(u'Logging in')
  32. def _real_initialize(self):
  33. if self._downloader is None:
  34. return
  35. useremail = None
  36. password = None
  37. downloader_params = self._downloader.params
  38. # Attempt to use provided username and password or .netrc data
  39. if downloader_params.get('username', None) is not None:
  40. useremail = downloader_params['username']
  41. password = downloader_params['password']
  42. elif downloader_params.get('usenetrc', False):
  43. try:
  44. info = netrc.netrc().authenticators(self._NETRC_MACHINE)
  45. if info is not None:
  46. useremail = info[0]
  47. password = info[2]
  48. else:
  49. raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
  50. except (IOError, netrc.NetrcParseError) as err:
  51. self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
  52. return
  53. if useremail is None:
  54. return
  55. # Log in
  56. login_form = {
  57. 'email': useremail,
  58. 'pass': password,
  59. 'login': 'Log+In'
  60. }
  61. request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
  62. try:
  63. self.report_login()
  64. login_results = compat_urllib_request.urlopen(request).read()
  65. if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
  66. self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
  67. return
  68. except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  69. self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
  70. return
  71. def _real_extract(self, url):
  72. mobj = re.match(self._VALID_URL, url)
  73. if mobj is None:
  74. raise ExtractorError(u'Invalid URL: %s' % url)
  75. video_id = mobj.group('ID')
  76. url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
  77. webpage = self._download_webpage(url, video_id)
  78. BEFORE = '{swf.addParam(param[0], param[1]);});\n'
  79. AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
  80. m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
  81. if not m:
  82. raise ExtractorError(u'Cannot parse data')
  83. data = dict(json.loads(m.group(1)))
  84. params_raw = compat_urllib_parse.unquote(data['params'])
  85. params = json.loads(params_raw)
  86. video_data = params['video_data'][0]
  87. video_url = video_data.get('hd_src')
  88. if not video_url:
  89. video_url = video_data['sd_src']
  90. if not video_url:
  91. raise ExtractorError(u'Cannot find video URL')
  92. video_duration = int(video_data['video_duration'])
  93. thumbnail = video_data['thumbnail_src']
  94. video_title = self._html_search_regex(
  95. r'<h2 class="uiHeaderTitle">([^<]*)</h2>', webpage, u'title')
  96. info = {
  97. 'id': video_id,
  98. 'title': video_title,
  99. 'url': video_url,
  100. 'ext': 'mp4',
  101. 'duration': video_duration,
  102. 'thumbnail': thumbnail,
  103. }
  104. return [info]