1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
|
import collections
import contextlib
import json
import os
import subprocess
import tempfile
import urllib.parse
from ..utils import (
ExtractorError,
Popen,
check_executable,
format_field,
get_exe_version,
is_outdated_version,
shell_quote,
)
def cookie_to_dict(cookie):
cookie_dict = {
'name': cookie.name,
'value': cookie.value,
}
if cookie.port_specified:
cookie_dict['port'] = cookie.port
if cookie.domain_specified:
cookie_dict['domain'] = cookie.domain
if cookie.path_specified:
cookie_dict['path'] = cookie.path
if cookie.expires is not None:
cookie_dict['expires'] = cookie.expires
if cookie.secure is not None:
cookie_dict['secure'] = cookie.secure
if cookie.discard is not None:
cookie_dict['discard'] = cookie.discard
with contextlib.suppress(TypeError):
if (cookie.has_nonstandard_attr('httpOnly')
or cookie.has_nonstandard_attr('httponly')
or cookie.has_nonstandard_attr('HttpOnly')):
cookie_dict['httponly'] = True
return cookie_dict
def cookie_jar_to_list(cookie_jar):
return [cookie_to_dict(cookie) for cookie in cookie_jar]
class PhantomJSwrapper:
"""PhantomJS wrapper class
This class is experimental.
"""
INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html'
_BASE_JS = R'''
phantom.onError = function(msg, trace) {{
var msgStack = ['PHANTOM ERROR: ' + msg];
if(trace && trace.length) {{
msgStack.push('TRACE:');
trace.forEach(function(t) {{
msgStack.push(' -> ' + (t.file || t.sourceURL) + ': ' + t.line
+ (t.function ? ' (in function ' + t.function +')' : ''));
}});
}}
console.error(msgStack.join('\n'));
phantom.exit(1);
}};
'''
_TEMPLATE = R'''
var page = require('webpage').create();
var fs = require('fs');
var read = {{ mode: 'r', charset: 'utf-8' }};
var write = {{ mode: 'w', charset: 'utf-8' }};
JSON.parse(fs.read("{cookies}", read)).forEach(function(x) {{
phantom.addCookie(x);
}});
page.settings.resourceTimeout = {timeout};
page.settings.userAgent = "{ua}";
page.onLoadStarted = function() {{
page.evaluate(function() {{
delete window._phantom;
delete window.callPhantom;
}});
}};
var saveAndExit = function() {{
fs.write("{html}", page.content, write);
fs.write("{cookies}", JSON.stringify(phantom.cookies), write);
phantom.exit();
}};
page.onLoadFinished = function(status) {{
if(page.url === "") {{
page.setContent(fs.read("{html}", read), "{url}");
}}
else {{
{jscode}
}}
}};
page.open("");
'''
_TMP_FILE_NAMES = ['script', 'html', 'cookies']
@staticmethod
def _version():
return get_exe_version('phantomjs', version_re=r'([0-9.]+)')
def __init__(self, extractor, required_version=None, timeout=10000):
self._TMP_FILES = {}
self.exe = check_executable('phantomjs', ['-v'])
if not self.exe:
raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True)
self.extractor = extractor
if required_version:
version = self._version()
if is_outdated_version(version, required_version):
self.extractor._downloader.report_warning(
'Your copy of PhantomJS is outdated, update it to version '
f'{required_version} or newer if you encounter any errors.')
for name in self._TMP_FILE_NAMES:
tmp = tempfile.NamedTemporaryFile(delete=False)
tmp.close()
self._TMP_FILES[name] = tmp
self.options = collections.ChainMap({
'timeout': timeout,
}, {
x: self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"')
for x in self._TMP_FILE_NAMES
})
def __del__(self):
for name in self._TMP_FILE_NAMES:
with contextlib.suppress(OSError, KeyError):
os.remove(self._TMP_FILES[name].name)
def _save_cookies(self, url):
cookies = cookie_jar_to_list(self.extractor.cookiejar)
for cookie in cookies:
if 'path' not in cookie:
cookie['path'] = '/'
if 'domain' not in cookie:
cookie['domain'] = urllib.parse.urlparse(url).netloc
with open(self._TMP_FILES['cookies'].name, 'wb') as f:
f.write(json.dumps(cookies).encode())
def _load_cookies(self):
with open(self._TMP_FILES['cookies'].name, 'rb') as f:
cookies = json.loads(f.read().decode('utf-8'))
for cookie in cookies:
if cookie['httponly'] is True:
cookie['rest'] = {'httpOnly': None}
if 'expiry' in cookie:
cookie['expire_time'] = cookie['expiry']
self.extractor._set_cookie(**cookie)
def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'):
"""
Downloads webpage (if needed) and executes JS
Params:
url: website url
html: optional, html code of website
video_id: video id
note: optional, displayed when downloading webpage
note2: optional, displayed when executing JS
headers: custom http headers
jscode: code to be executed when page is loaded
Returns tuple with:
* downloaded website (after JS execution)
* anything you print with `console.log` (but not inside `page.execute`!)
In most cases you don't need to add any `jscode`.
It is executed in `page.onLoadFinished`.
`saveAndExit();` is mandatory, use it instead of `phantom.exit()`
It is possible to wait for some element on the webpage, e.g.
var check = function() {
var elementFound = page.evaluate(function() {
return document.querySelector('#b.done') !== null;
});
if(elementFound)
saveAndExit();
else
window.setTimeout(check, 500);
}
page.evaluate(function(){
document.querySelector('#a').click();
});
check();
"""
if 'saveAndExit();' not in jscode:
raise ExtractorError('`saveAndExit();` not found in `jscode`')
if not html:
html = self.extractor._download_webpage(url, video_id, note=note, headers=headers)
with open(self._TMP_FILES['html'].name, 'wb') as f:
f.write(html.encode())
self._save_cookies(url)
user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent']
jscode = self._TEMPLATE.format_map(self.options.new_child({
'url': url,
'ua': user_agent.replace('"', '\\"'),
'jscode': jscode,
}))
stdout = self.execute(jscode, video_id, note=note2)
with open(self._TMP_FILES['html'].name, 'rb') as f:
html = f.read().decode('utf-8')
self._load_cookies()
return html, stdout
def execute(self, jscode, video_id=None, *, note='Executing JS'):
"""Execute JS and return stdout"""
if 'phantom.exit();' not in jscode:
jscode += ';\nphantom.exit();'
jscode = self._BASE_JS + jscode
with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f:
f.write(jscode)
self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}')
cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name]
self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}')
try:
stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 1000,
text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
except Exception as e:
raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e)
if returncode:
raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}')
return stdout
|