Browse Source

[moviefap] Add new extractor

totalwebcasting
George Brighton 10 years ago
parent
commit
82ea1051b5
2 changed files with 118 additions and 0 deletions
  1. +1
    -0
      youtube_dl/extractor/__init__.py
  2. +117
    -0
      youtube_dl/extractor/moviefap.py

+ 1
- 0
youtube_dl/extractor/__init__.py View File

@@ -311,6 +311,7 @@ from .morningstar import MorningstarIE
from .motherless import MotherlessIE
from .motorsport import MotorsportIE
from .movieclips import MovieClipsIE
from .moviefap import MovieFapIE
from .moviezine import MoviezineIE
from .movshare import MovShareIE
from .mtv import (


+ 117
- 0
youtube_dl/extractor/moviefap.py View File

@@ -0,0 +1,117 @@
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import str_to_int
class MovieFapIE(InfoExtractor):
    """
    Information extractor for video pages on moviefap.com.

    The video page embeds the URL of an XML configuration document; that
    document carries the media URL(s), the poster thumbnail and (for some
    videos) a timeline of preview images.  Everything else is scraped from
    the page HTML.
    """

    # The slug group accepts anything up to the next '/', '?', '#' or '.',
    # so slugs containing digits or upper-case letters are captured whole
    # instead of being cut short at the first such character.
    _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<name>[^/?#.]+)'

    _TESTS = [{
        'url': 'http://www.moviefap.com/videos/e5da0d3edce5404418f5/jeune-couple-russe.html',
        'md5': 'fa56683e291fc80635907168a743c9ad',
        'info_dict': {
            'id': 'e5da0d3edce5404418f5',
            'ext': 'flv',
            'title': 'Jeune Couple Russe',
            'description': 'Amateur',
            'thumbnail': 'http://pic.moviefap.com/thumbs/e5/949-18l.jpg',
            'uploader_id': 'whiskeyjar',
            'display_id': 'jeune-couple-russe'
        }
    }, {
        'url': 'http://www.moviefap.com/videos/3080837f6712355015c2/busty-british-blonde-takes-backdoor-in-fake-taxi.html',
        'md5': 'bedef72cb23d27a20755fc430a6d7a0e',
        'info_dict': {
            'id': '3080837f6712355015c2',
            'ext': 'mp4',
            'title': 'Busty British blonde takes backdoor in fake taxi',
            'description': 'Big boobs British blonde flashing in fake taxi then giving titsjob and rimjob in the back seat before getting big cock up her tight ass',
            'thumbnail': 'http://img.moviefap.com/a16:9w990r/thumbs/30/322021-18l.jpg',
            'uploader_id': 'momcikoper',
            'display_id': 'busty-british-blonde-takes-backdoor-in-fake-taxi'
        }
    }]

    @staticmethod
    def _xml_text(node, path):
        """
        Returns the text of the first element matching ``path`` under
        ``node``, or None when the element is absent or has no text.
        :param node: the XML element to search from
        :param path: the ElementTree path of the wanted element
        """
        # findtext() yields '' for an element without text; fold that into
        # None so callers only have a single "missing" value to test for
        return node.findtext(path) or None

    @staticmethod
    def __get_thumbnail_data(xml):
        """
        Constructs a list of video thumbnails from timeline preview images.
        Returns an empty list when the timeline, its image pattern or its
        image range is missing.
        :param xml: the information XML document to parse
        """
        timeline = xml.find('timeline')
        if timeline is None:
            # not all videos have the data - ah well
            return []

        def int_field(tag):
            # numeric child of <timeline>, or None when absent/empty
            text = timeline.findtext(tag)
            return str_to_int(text) if text else None

        pattern = timeline.findtext('imagePattern')
        first = int_field('imageFirst')
        last = int_field('imageLast')
        if not pattern or first is None or last is None:
            # without the pattern and the range no URL can be generated
            return []

        width = int_field('imageWidth')
        height = int_field('imageHeight')

        # '#' in the pattern is the placeholder for the image number;
        # the range is inclusive of the last image
        return [{
            'url': pattern.replace('#', str(number)),
            'width': width,
            'height': height,
        } for number in range(first, last + 1)]

    def _real_extract(self, url):
        """
        Builds the info dictionary for the video at ``url``.

        The title, the player configuration URL and the XML document are
        mandatory and fail the extraction through the base-class helpers
        when missing.  All other metadata is optional: a field that cannot
        be found is reported as None (a warning is left to the helper)
        rather than aborting the extraction.
        :param url: the video page URL, matching _VALID_URL
        """
        video_id = self._match_id(url)
        display_id = re.match(self._VALID_URL, url).group('name')

        webpage = self._download_webpage(url, video_id)

        # find the URL of the XML document detailing video download URLs
        info_url = self._html_search_regex(
            r'flashvars\.config = escape\("(.+?)"', webpage, 'player parameters')
        xml = self._download_xml(info_url, video_id)

        def optional(pattern, name):
            # non-essential metadata: never let it break the extraction
            return self._html_search_regex(pattern, webpage, name, fatal=False)

        view_count = optional(r'<br>Views <strong>([0-9]+)</strong>', 'view_count')
        comment_count = optional(r'<span id="comCount">([0-9]+)</span>', 'comment_count')

        try:
            average_rating = float(
                optional(r'Current Rating<br> <strong>(.*?)</strong>', 'average_rating'))
        except (TypeError, ValueError):
            # rating missing (None) or not a number
            average_rating = None

        categories = optional(r'</div>\s*(.*?)\s*<br>', 'categories')

        info = {
            'id': video_id,
            'title': self._html_search_regex(
                r'<div id="view_title"><h1>(.*?)</h1>', webpage, 'title'),
            'display_id': display_id,
            'thumbnails': self.__get_thumbnail_data(xml),
            'thumbnail': self._xml_text(xml, 'startThumb'),
            'description': optional(r'name="description" value="(.*?)"', 'description'),
            'uploader_id': optional(r'name="username" value="(.*?)"', 'uploader_id'),
            'view_count': str_to_int(view_count) if view_count else None,
            'average_rating': average_rating,
            'comment_count': str_to_int(comment_count) if comment_count else None,
            'age_limit': 18,
            # fall back to the requested URL so this key is never None
            'webpage_url': optional(r'name="link" value="(.*?)"', 'webpage_url') or url,
            'categories': categories.split(', ') if categories else None,
            # the container type is only announced for some videos
            'ext': self._xml_text(xml, 'videoConfig/type') or 'flv',  # guess...
        }

        # work out the video URL(s)
        video_link = self._xml_text(xml, 'videoLink')
        if video_link is not None:
            # single format available
            info['url'] = video_link
        else:
            # multiple formats available
            # N.B. formats are already in ascending order of quality
            formats = []
            for item in xml.findall('quality/item'):
                item_link = self._xml_text(item, 'videoLink')
                if item_link is None:
                    # an entry without a URL cannot be downloaded
                    continue
                formats.append({
                    'url': item_link,
                    'resolution': self._xml_text(item, 'res'),  # 480p etc.
                })
            # NOTE(review): an empty list is passed on as-is; presumably the
            # caller reports "no formats" for it - confirm
            info['formats'] = formats

        return info

Loading…
Cancel
Save