Browse Source

[EsriVideo] Add new extractor

Add extractor for [video.esri.com](https://video.esri.com), a collection
of videos relating to GIS.
totalwebcasting
Shaun Walbridge 10 years ago
committed by Sergey M․
parent
commit
8b8c1093b6
2 changed files with 91 additions and 0 deletions
  1. +1
    -0
      youtube_dl/extractor/__init__.py
  2. +90
    -0
      youtube_dl/extractor/videoesri.py

+ 1
- 0
youtube_dl/extractor/__init__.py View File

@ -695,6 +695,7 @@ from .vice import ViceIE
from .viddler import ViddlerIE
from .videobam import VideoBamIE
from .videodetective import VideoDetectiveIE
from .videoesri import VideoEsriIE
from .videolecturesnet import VideoLecturesNetIE
from .videofyme import VideofyMeIE
from .videomega import VideoMegaIE


+ 90
- 0
youtube_dl/extractor/videoesri.py View File

@ -0,0 +1,90 @@
# coding: utf-8
from __future__ import unicode_literals
import os
import re
from .common import InfoExtractor
from ..utils import (
unified_strdate
)
class VideoEsriIE(InfoExtractor):
    """Information extractor for watch pages on video.esri.com.

    The watch page embeds an ``evPlayerSettings`` JavaScript object; the
    thumbnail path, the HTTP base URL and the per-resolution mp4 paths are
    all scraped from that object.
    """

    _VALID_URL = r'https?://video\.esri\.com/watch/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://video.esri.com/watch/4228',
        'md5': '170b4d513c2466ed483c150a48384133',
        'info_dict': {
            'id': '4228',
            'ext': 'mp4',
            'title': 'AppStudio for ArcGIS',
            # raw string: '\.' is not a valid escape in a normal literal
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20150310',
        }
    }

    def _real_extract(self, url):
        """Scrape the watch page at *url* and return its info dict.

        The returned dict carries ``id``, ``title``, ``upload_date`` and
        ``formats``; ``thumbnail`` is added only when one is found in the
        player settings.  Missing title, upload date, player settings or
        base URL are reported through ``_search_regex`` (fatal lookups).
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h1>(.*?)</h1>', webpage, 'title')

        # Stop at the closing quote instead of greedily running to the
        # last quote on the line.
        upload_date_raw = self._search_regex(
            r'http-equiv="last-modified" content="([^"]+)"',
            webpage, 'upload date')
        upload_date = unified_strdate(upload_date_raw)

        settings_info = self._search_regex(
            r'evPlayerSettings = {(.*?);\s*$',
            webpage, 'settings info', flags=re.MULTILINE | re.DOTALL)

        # thumbnail includes '_x' for large, also has {_m,_t,_s} or
        # without size suffix returns full image
        # The thumbnail is optional metadata, so a missing one must not
        # abort extraction.
        thumbnail = None
        thumbnail_path = self._search_regex(
            r'image\': \'(\/thumbs[^\']*)\'',
            settings_info, 'thumbnail', default=None)
        if thumbnail_path:
            # thumbnail_path already starts with '/', so plain
            # concatenation avoids a doubled slash in the URL.
            thumbnail = 'http://video.esri.com' + thumbnail_path

        # note that this misses the (exceedingly rare) webm files
        video_paths = re.findall(r'mp4:([^\']*)\'', settings_info)

        # find possible http servers of the mp4 files (also has rtsp)
        base_url = self._search_regex(
            r'netstreambasepath\':\s\'(h[^\']*)\'',
            settings_info, 'base url')

        # these are the numbers used internally, but really map
        # to other resolutions, e.g. 960 is 720p.
        heights = [480, 720, 960]

        videos_by_res = {}
        for video_path in video_paths:
            filename, ext = os.path.splitext(video_path)
            try:
                height_label = int(filename.split('_')[1])
            except (IndexError, ValueError):
                # Path does not follow the '<name>_<height>' scheme; it
                # could never match an entry of `heights`, so skip it.
                continue
            videos_by_res[height_label] = {
                'url': base_url + video_path,
                'ext': ext[1:],
                'protocol': 'http',  # http-only supported currently
            }

        # Emit the known resolutions in ascending order.
        formats = []
        for height in heights:
            if height in videos_by_res:
                formats.append(videos_by_res[height])

        result = {
            'id': video_id,
            'title': title,
            'upload_date': upload_date,
            'formats': formats,
        }
        if thumbnail:
            result['thumbnail'] = thumbnail
        return result

Loading…
Cancel
Save