You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

150 lines
5.4 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .utils import (
  4. ExtractorError,
  5. )
  6. class JSInterpreter(object):
  7. def __init__(self, code):
  8. self.code = code
  9. self._functions = {}
  10. self._objects = {}
  11. def interpret_statement(self, stmt, local_vars, allow_recursion=20):
  12. if allow_recursion < 0:
  13. raise ExtractorError('Recursion limit reached')
  14. if stmt.startswith('var '):
  15. stmt = stmt[len('var '):]
  16. ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
  17. r'=(?P<expr>.*)$', stmt)
  18. if ass_m:
  19. if ass_m.groupdict().get('index'):
  20. def assign(val):
  21. lvar = local_vars[ass_m.group('out')]
  22. idx = self.interpret_expression(
  23. ass_m.group('index'), local_vars, allow_recursion)
  24. assert isinstance(idx, int)
  25. lvar[idx] = val
  26. return val
  27. expr = ass_m.group('expr')
  28. else:
  29. def assign(val):
  30. local_vars[ass_m.group('out')] = val
  31. return val
  32. expr = ass_m.group('expr')
  33. elif stmt.startswith('return '):
  34. assign = lambda v: v
  35. expr = stmt[len('return '):]
  36. else:
  37. raise ExtractorError(
  38. 'Cannot determine left side of statement in %r' % stmt)
  39. v = self.interpret_expression(expr, local_vars, allow_recursion)
  40. return assign(v)
  41. def interpret_expression(self, expr, local_vars, allow_recursion):
  42. if expr.isdigit():
  43. return int(expr)
  44. if expr.isalpha():
  45. return local_vars[expr]
  46. m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
  47. if m:
  48. member = m.group('member')
  49. variable = m.group('in')
  50. if variable not in local_vars:
  51. if variable not in self._objects:
  52. self._objects[variable] = self.extract_object(variable)
  53. obj = self._objects[variable]
  54. key, args = member.split('(', 1)
  55. args = args.strip(')')
  56. argvals = [int(v) if v.isdigit() else local_vars[v]
  57. for v in args.split(',')]
  58. return obj[key](argvals)
  59. val = local_vars[variable]
  60. if member == 'split("")':
  61. return list(val)
  62. if member == 'join("")':
  63. return ''.join(val)
  64. if member == 'length':
  65. return len(val)
  66. if member == 'reverse()':
  67. return val[::-1]
  68. slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
  69. if slice_m:
  70. idx = self.interpret_expression(
  71. slice_m.group('idx'), local_vars, allow_recursion - 1)
  72. return val[idx:]
  73. m = re.match(
  74. r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
  75. if m:
  76. val = local_vars[m.group('in')]
  77. idx = self.interpret_expression(
  78. m.group('idx'), local_vars, allow_recursion - 1)
  79. return val[idx]
  80. m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
  81. if m:
  82. a = self.interpret_expression(
  83. m.group('a'), local_vars, allow_recursion)
  84. b = self.interpret_expression(
  85. m.group('b'), local_vars, allow_recursion)
  86. return a % b
  87. m = re.match(
  88. r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
  89. if m:
  90. fname = m.group('func')
  91. if fname not in self._functions:
  92. self._functions[fname] = self.extract_function(fname)
  93. argvals = [int(v) if v.isdigit() else local_vars[v]
  94. for v in m.group('args').split(',')]
  95. return self._functions[fname](argvals)
  96. raise ExtractorError('Unsupported JS expression %r' % expr)
  97. def extract_object(self, objname):
  98. obj = {}
  99. obj_m = re.search(
  100. (r'(?:var\s+)?%s\s*=\s*\{' % re.escape(objname)) +
  101. r'\s*(?P<fields>([a-zA-Z$]+\s*:\s*function\(.*?\)\s*\{.*?\})*)' +
  102. r'\}\s*;',
  103. self.code)
  104. fields = obj_m.group('fields')
  105. # Currently, it only supports function definitions
  106. fields_m = re.finditer(
  107. r'(?P<key>[a-zA-Z$]+)\s*:\s*function'
  108. r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
  109. fields)
  110. for f in fields_m:
  111. argnames = f.group('args').split(',')
  112. obj[f.group('key')] = self.build_function(argnames, f.group('code'))
  113. return obj
  114. def extract_function(self, funcname):
  115. func_m = re.search(
  116. (r'(?:function %s|[{;]%s\s*=\s*function)' % (
  117. re.escape(funcname), re.escape(funcname))) +
  118. r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
  119. self.code)
  120. if func_m is None:
  121. raise ExtractorError('Could not find JS function %r' % funcname)
  122. argnames = func_m.group('args').split(',')
  123. return self.build_function(argnames, func_m.group('code'))
  124. def build_function(self, argnames, code):
  125. def resf(args):
  126. local_vars = dict(zip(argnames, args))
  127. for stmt in code.split(';'):
  128. res = self.interpret_statement(stmt, local_vars)
  129. return res
  130. return resf