浏览器安装:playwright install

使用无痕浏览器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# Launch a fresh (non-persistent) Chromium instance with a visible window.
browser = pw.chromium.launch(
    # Optional upstream proxy; uncomment and fill in to route traffic through it.
    # proxy={
    #     "server": "http://175.6.136.136:20000",  # proxy server address
    #     "username": "",  # proxy user name
    #     "password": "",  # proxy password
    # },
    headless=False,
    args=[
        "--start-maximized",
        "--disable-features=AutomationControlled",
        # This switch takes a comma-separated feature list; passed bare it
        # disables nothing, so name the feature explicitly.
        "--disable-blink-features=AutomationControlled",
    ],
    # Drop the --enable-automation switch that Playwright passes by default.
    ignore_default_args=["--enable-automation"],
    # executable_path="C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
)
context = browser.new_context(
    no_viewport=True,  # no fixed viewport, so the page can fill the maximized window
    ignore_https_errors=True,
)
page = context.new_page()
# Init script: make navigator.webdriver read as undefined on every page load.
code = (
    """Object.defineProperties(navigator, {webdriver:{get:()=>undefined}});"""
)
page.add_init_script(code)

使用代理(另一种方式)

1
2
3
4
5
6
7
8
9
10
# Attach to a browser over its CDP websocket endpoint instead of launching one.
browser = pw.chromium.connect_over_cdp(
    endpoint_url="ws://localhost:8080/?stealth",
)
# The proxy is configured per context here rather than at browser launch.
context = browser.new_context(
    ignore_https_errors=True,
    no_viewport=True,  # maximized
    proxy=dict(
        server="http://175.6.136.136:20000",
        username="",
        password="",
    ),
)

使用浏览器缓存

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Launch Chrome against a fixed user-data directory so the profile on disk is
# reused between runs. Note that launch_persistent_context returns a browser
# context (pages are opened on it directly), despite the attribute name.
self.browser = self.pw.chromium.launch_persistent_context(
    # proxy={"server": "http://10.20.1.19:31280"},
    headless=False,
    executable_path="C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
    args=[
        "--start-maximized",
        "--disable-features=AutomationControlled",
        # This switch takes a comma-separated feature list; passed bare it
        # disables nothing, so name the feature explicitly.
        "--disable-blink-features=AutomationControlled",
    ],
    # Drop the --enable-automation switch that Playwright passes by default.
    ignore_default_args=["--enable-automation"],
    user_data_dir="C:\\Users\\14276\\Desktop\\btConfig",
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    no_viewport=True,
    ignore_https_errors=True,
)
self.page = self.browser.new_page()

基本用法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# Walk-through of common page operations: navigate, click, type, read
# elements, and drag a slider with the mouse.
self.page.goto(self.home_url)
self.page.click("div.switch-switch.email")
self.page.wait_for_timeout(1000)
self.page.fill('input[fieldname="item_label_email"]', "")  # clear the field
self.page.type(
    'input[fieldname="item_label_email"]',
    self.username,
    delay=random.randint(100, 150),
)  # type the value with a delay between key presses
frames = self.page.frames  # frames (iframes) of the page
# Check whether an element exists
ele = self.page.query_selector(slider_css)
# Read an element's text
ele = self.page.locator("div.ecom-dropdown-trigger>span:first-of-type")
self.shop_name = ele.text_content()
# Read an element's coordinates
slider = self.frame.locator("img#captcha-verify_img_slide")
cdn = slider.bounding_box()
# NOTE(review): 27 + 4 is an unexplained pixel offset from the slider's left
# edge -- confirm against the captcha layout.
length = cdn["x"] + 27 + 4
high = cdn["y"]
# Drag with the mouse: press, move to the start point, then follow the track
self.frame.hover("img#captcha-verify_img_slide")
self.page.mouse.down()
self.page.mouse.move(length, high)
for x in self.get_track(real_length):
    # Each track item is an x-increment; y stays fixed.
    length = length + x
    self.page.mouse.move(length, high)
    self.page.wait_for_timeout(random.randint(10, 15))
# Release only once the whole track has been replayed.
self.page.mouse.up()

拦截请求

1
2
3
4
5
6
# Route every request URL (glob '**/*') through handle_request.
# NOTE(review): in a runnable script this call has to come after the
# handle_request definition -- confirm the ordering in the real file.
page.route('**/*', handle_request)

def handle_request(route, request):
    """Print matching request URLs, then let every request proceed.

    Args:
        route: Route object of the intercepted request; ``continue_()`` is
            called on it.
        request: The intercepted request; only its ``url`` is read.
    """
    if "strategy_market" in request.url:
        print(request.url)
    # Continue unconditionally: a routed request that is never continued,
    # fulfilled or aborted would stay pending.
    route.continue_()

发起请求

参考:《浏览器中 XMLHttpRequest 和 Fetch 的区别》- 掘金(前言:随着前后端分离模式的流行,在项目开发过程中必不可少的一项就是……)

  • GET(XmlHttpRequest)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def rpc_get_code(item_url, item_body):
    """Build JavaScript source that performs a GET via XMLHttpRequest.

    The returned text is an async arrow function. When evaluated it requests
    ``item_url + "?" + item_body``, resolves with the JSON-parsed response on
    a 2xx status, and rejects with ``{status, statusText}`` (or the parse
    error) otherwise.

    Args:
        item_url: Request URL without the query string.
        item_body: Already-encoded query string appended after ``?``.

    Returns:
        The JavaScript source as a string.
    """
    # Local imports keep the snippet self-contained.
    import json
    import re

    exe_js_code = """
async () => {
    async function getData() {
        // 定义请求的URL和参数
        const url = __ITEM_URL__;
        const params = __ITEM_BODY__;
        const fullUrl = `${url}?${params}`;

        const xhr = new XMLHttpRequest();

        // 初始化一个Promise来处理异步请求
        return new Promise((resolve, reject) => {
            // 配置请求类型为GET
            xhr.open('GET', fullUrl, true);

            // 设置请求完成的回调函数
            xhr.onload = function () {
                if (xhr.status >= 200 && xhr.status < 300) {
                    // 请求成功,将响应数据传递给resolve
                    try {
                        const data = JSON.parse(xhr.responseText);
                        resolve(data); // 将JSON对象传递给resolve
                    } catch (err) {
                        // A non-JSON body must settle the promise too.
                        reject(err);
                    }
                } else {
                    // 请求失败,将错误传递给reject
                    reject({
                        status: xhr.status,
                        statusText: xhr.statusText
                    });
                }
            };

            // 设置请求失败的回调函数
            xhr.onerror = function () {
                reject({
                    status: xhr.status,
                    statusText: xhr.statusText
                });
            };
            xhr.send();
        });
    };
    var res = await getData()
    return res
}
"""
    # json.dumps yields a correctly quoted/escaped JS string literal, so quotes
    # or backslashes in the inputs cannot break out of the literal.
    literals = {
        "URL": json.dumps(item_url),
        "BODY": json.dumps(item_body),
    }
    # Single-pass substitution: inserted text is never re-scanned, so an input
    # that happens to contain a placeholder name is left untouched.
    new_js = re.sub(
        r"__ITEM_(URL|BODY)__", lambda m: literals[m.group(1)], exe_js_code
    )
    return new_js
  • POST(Fetch)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def rpc_post_code(item_url, item_body):
    """Build JavaScript source that performs a POST via fetch.

    The returned text is an async arrow function. When evaluated it POSTs
    ``item_body`` (serialized with ``JSON.stringify``) to ``item_url`` and
    resolves with the JSON-parsed response. Network or parse failures reject
    the returned promise instead of leaving it pending.

    Args:
        item_url: Target URL.
        item_body: JSON-serializable request payload.

    Returns:
        The JavaScript source as a string.
    """
    # Local imports keep the snippet self-contained.
    import json
    import re

    exe_js_code = """
async () => {
    async function postData() {
        const body = JSON.stringify(__ITEM_BODY__);
        const url = __ITEM_URL__;
        const response = await fetch(url, {
            method: 'POST',
            body: body
        });
        return response.json();
    };
    var res = await postData()
    return res
}
"""
    literals = {
        # The payload is embedded as a JS object literal (JSON is valid JS).
        "BODY": json.dumps(item_body),
        # The URL is embedded as a quoted, escaped JS string literal.
        "URL": json.dumps(item_url),
    }
    # Single-pass substitution: a payload containing the text of a placeholder
    # is not rewritten by the URL substitution.
    new_js = re.sub(
        r"__ITEM_(URL|BODY)__", lambda m: literals[m.group(1)], exe_js_code
    )
    return new_js

开启stealth模式

pip install playwright_stealth

1
2
3
from playwright_stealth import stealth_sync  # synchronous mode

# Enable stealth mode on this page (presumably must run before navigation --
# TODO confirm against the playwright_stealth docs for the installed version).
stealth_sync(self.page)
1
2
3
4
5
6
7
8
# Convert a {name: value} cookie mapping into the list-of-dicts form that
# BrowserContext.add_cookies expects, then load it into a new context.
ck_dct = [
    {"name": name.strip(), "value": value.strip(), "domain": ".jd.com", "path": "/"}
    # The loop variables must be the names used in the dict above.
    for name, value in cookies.items()
]
with sync_playwright() as pw:
    browser = pw.chromium.launch()
    context = browser.new_context()
    context.add_cookies(ck_dct)
1
2
# Flatten the context's cookies into one "name=value;" string.
# Built in a single expression so the result does not depend on a
# pre-initialised accumulator.
cookie = "".join(
    f"{ck['name']}={ck['value']};" for ck in self.page.context.cookies()
)

贝塞尔曲线轨迹

From:https://pypi.org/project/python-ghost-cursor/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from python_ghost_cursor import path as slide_path

In [4]: start = {"x": 0, "y": 0}
...: end = {"x": 40, "y": 20}

In [5]: slide_path(start, end)
Out[5]:
[{'x': 0, 'y': 0},
{'x': 1.7205508520070967, 'y': 0},
{'x': 3.280150746033031, 'y': 0},
{'x': 4.762752498693698, 'y': 0},
{'x': 6.252308926604997, 'y': 0},
{'x': 7.832772846382823, 'y': 0},
{'x': 9.588097074643073, 'y': 0},
{'x': 11.602234428001648, 'y': 0},
{'x': 13.959137723074438, 'y': 0},
{'x': 16.742759776477342, 'y': 0},
{'x': 20.03705340482626, 'y': 0},
{'x': 23.92597142473709, 'y': 1.0269500874731925},
{'x': 28.493466652825727, 'y': 6.196217858339223},
{'x': 33.82349190570806, 'y': 12.498489441945145},
{'x': 40.0, 'y': 20.0}]