Commit 3db14115 by yexing

[yx] update

parent f63311b6
-2.4.3
\ No newline at end of file
+2.5.1
\ No newline at end of file
No preview for this file type
@@ -19685,41 +19685,28 @@
 let taskInterval = null, taskNum = 0, logs = [];
 function startTask() {
-    // chrome.storage.local.get(['isRunning']).then(console.log);
-    // chrome.storage.local.set({ isRunning: true });
     taskNum++;
     if (taskInterval) return;
     chrome.storage.local.set({ isRunning: true });
-    // chrome.storage.local.get(null, console.log);
     taskInterval = setInterval(() => {
         // 推送更新
         const ackLen = logs.filter(x => x.message.includes("完成")).length;
+        chrome.runtime.sendMessage({ type: 'progress', logs });
         if (ackLen === taskNum) {
-            clearInterval(taskInterval);
-            // recycle
-            taskInterval = null, taskNum = 0;
             logs.push({
                 time: new Date().toLocaleTimeString(),
                 message: "已全部完成"
             });
+            chrome.runtime.sendMessage({ type: 'progress', logs });
             // copy logs
             chrome.storage.local.set({ isRunning: false, logs });
+            clearInterval(taskInterval);
+            // recycle
+            taskInterval = null, taskNum = 0, logs = [];
         }
-        chrome.runtime.sendMessage({
-            type: 'progress', logs
-        });
     }, 1000);
 }
-const initTimedTask = () => {
-    for (const [key, { uri, params }] of Object.entries(TABLE)) {
-        const plan = new Plan(null, params);
-        const period = plan.get()?.period;
-        const when = plan.next().dt.getTime();
-        chrome.alarms.create(key, { when });
-        console.log(`[timedTask] uri: ${uri} period: ${period} when: ${when}`);
-    }
-};
 chrome.alarms.onAlarm.addListener((alarm) => {
     let [key, fromDate, toDate, when] = alarm.name.split('_');
     if (!Object.keys(TABLE).includes(key)) return;
@@ -19735,14 +19722,10 @@
     sendTabMsg({ type: "run", data: { uri, period } });
 });
 // 初始化
-initTimedTask();
-// chrome.alarms.clearAll();
-const getTmpClock = () => {
-    const clock = new Date();
-    clock.setHours(22, 0, 0, 0);
-    return clock.getTime();
-};
+// initTimedTask();
+// chrome.storage.local.clear();
+// chrome.alarms.getAll(console.log);
+chrome.alarms.clearAll();
 globalThis.initTmpTimedTask = (when) => {
     const period = ['2025-02-01', '2025-02-28'];
     for (const [key, { uri }] of Object.entries(TABLE)) {
@@ -19751,7 +19734,7 @@
         console.log(`[timedTask] uri: ${uri} period: ${period} when: ${when}`);
     }
 };
-globalThis.initTmpTimedTask(getTmpClock());
+// globalThis.initTmpTimedTask(getTmpClock());
 // initTmpTimedTask(new Date().getTime() + 10)
 })();
@@ -138,10 +138,6 @@
     }));
     return rr;
 };
-const gHeaders = Object.freeze({
-    "Accept-Encoding": "gzip, deflate, br",
-    "wm_aurora.market": "US",
-});
 const headers = Object.freeze({
     "wm_aurora.market": "US",
 });
@@ -227,6 +223,8 @@
     if (!(this.uri && this.scid)) {
         throw new Error(`uri: ${this.uri}, scid: ${this.scid}`);
     }
+    this._retryLimit = 5;
+    this._retryCount = 0;
 }
 getFileName(sf) {
     const ext = this.options.ext || 'csv';
@@ -254,6 +252,19 @@
     ...this.options.requestInit
 });
 console.log(response);
+if (response.status !== 200) {
+    if (++this._retryCount < this._retryLimit) {
+        await sleep(DELAY);
+        const text = await response.text();
+        console.log(`重试请求, ${text}`);
+        return await this.send(...arguments);
+    } else {
+        console.error(`重试失败: ${response.url}`);
+        return false;
+    }
+}
+this._retryCount = 0; // reset
 let blob;
 if (callback) {
     blob = await callback.call(this, response);
@@ -625,7 +636,7 @@
 }, {
     fromDate,
     toDate,
-}, { headers: gHeaders }),
+}),
 new Task({
     apiName: "poAudit",
     showName: "Inbound receipts"
@@ -713,6 +724,7 @@
     }
 }
+// fetch("https://advertising.walmart.com/extend-sso")
 async function webJump() {
     if (location.hostname === 'login.account.wal-mart.com') {
         document.querySelector("button").click();
......
{"name":"Test","description":"导出沃尔玛报表","version":"2.4.3","manifest_version":3,"background":{"service_worker":"./js/background.js"},"permissions":["tabs","activeTab","scripting","notifications","storage","alarms"],"action":{"default_title":"WalmartExports","default_popup":"popup.html","default_icon":{"16":"icon.png","32":"icon.png","48":"icon.png","128":"icon.png"}},"icons":{"16":"icon.png","32":"icon.png","48":"icon.png","128":"icon.png"},"web_accessible_resources":[{"resources":["src/main.js"],"matches":["<all_urls>"]}],"host_permissions":["https://seller.walmart.com/*","https://advertising.walmart.com/*","https://login.account.wal-mart.com/*","https://marketplace.walmartapis.com/*","https://walmart.meinuosha.com/*"],"content_scripts":[{"matches":["https://seller.walmart.com/*","https://advertising.walmart.com/*","https://login.account.wal-mart.com/*","https://marketplace.walmartapis.com/*","https://walmart.meinuosha.com/*"],"js":["./js/main.js"]}]} {"name":"WalmartExports","description":"导出沃尔玛报表","version":"2.5.1","manifest_version":3,"background":{"service_worker":"./js/background.js"},"permissions":["tabs","activeTab","scripting","notifications","storage","alarms"],"action":{"default_title":"WalmartExports","default_popup":"popup.html","default_icon":{"16":"icon.png","32":"icon.png","48":"icon.png","128":"icon.png"}},"icons":{"16":"icon.png","32":"icon.png","48":"icon.png","128":"icon.png"},"web_accessible_resources":[{"resources":["src/main.js"],"matches":["<all_urls>"]}],"host_permissions":["https://seller.walmart.com/*","https://advertising.walmart.com/*","https://login.account.wal-mart.com/*","https://marketplace.walmartapis.com/*","https://walmart.meinuosha.com/*"],"content_scripts":[{"matches":["https://seller.walmart.com/*","https://advertising.walmart.com/*","https://login.account.wal-mart.com/*","https://marketplace.walmartapis.com/*","https://walmart.meinuosha.com/*"],"js":["./js/main.js"]}]}
\ No newline at end of file \ No newline at end of file
@@ -20,7 +20,7 @@ const changeManifest = (others = {}) => {
 }
 const inputs = ["src/main.js", "src/popup.js", "src/background.js"];
 const conf = [];
-const nodeEnv = process.env.NODE_ENV.trim() || 'DEV';
+const nodeEnv = process.env.NODE_ENV?.trim() || 'DEV';
 console.log(`NODE_ENV: ${nodeEnv}`);
 nodeEnv == 'TEST' ? changeManifest({ name: 'Test' }) : changeManifest();
 inputs.forEach(input => {
......
@@ -83,32 +83,29 @@ chrome.runtime.onMessage.addListener((msg, sender, sendResponse) => {
 let taskInterval = null, taskNum = 0, logs = [];
 function startTask() {
-    // chrome.storage.local.get(['isRunning']).then(console.log);
-    // chrome.storage.local.set({ isRunning: true });
     taskNum++;
     if (taskInterval) return;
     chrome.storage.local.set({ isRunning: true });
     taskInterval = setInterval(() => {
         // 推送更新
         const ackLen = logs.filter(x => x.message.includes("完成")).length;
+        chrome.runtime.sendMessage({ type: 'progress', logs });
         if (ackLen === taskNum) {
-            clearInterval(taskInterval);
-            // recycle
-            taskInterval = null, taskNum = 0;
             logs.push({
                 time: new Date().toLocaleTimeString(),
                 message: "已全部完成"
             });
+            chrome.runtime.sendMessage({ type: 'progress', logs });
             // copy logs
             chrome.storage.local.set({ isRunning: false, logs });
+            clearInterval(taskInterval);
+            // recycle
+            taskInterval = null, taskNum = 0, logs = [];
         }
-        chrome.runtime.sendMessage({
-            type: 'progress', logs
-        });
     }, 1000);
 }
+// eslint-disable-next-line no-unused-vars
 const initTimedTask = () => {
     for (const [key, { uri, params }] of Object.entries(TABLE)) {
         const plan = new Plan(null, params);
@@ -133,8 +130,11 @@ chrome.alarms.onAlarm.addListener((alarm) => {
     sendTabMsg({ type: "run", data: { uri, period } });
 });
 // 初始化
-initTimedTask();
-// chrome.alarms.clearAll();
+// initTimedTask();
+// chrome.storage.local.get(null, console.log);
+// chrome.storage.local.clear();
+// chrome.alarms.getAll(console.log);
+chrome.alarms.clearAll();
 // eslint-disable-next-line no-unused-vars
 const getTmpClock = () => {
......
 import { sleep, getValue, fmt0, fmt2, fmt3, JSON2CSV, createZip, renameZipFile } from "./util.js";
-import { rrHeaders, headers, gHeaders } from "./header.js";
+import { rrHeaders, headers } from "./header.js";
 import { TABLE } from "./conf.js";
 const DELAY = 2000; // 2s
@@ -43,6 +43,8 @@ class Task {
     if (!(this.uri && this.scid)) {
         throw new Error(`uri: ${this.uri}, scid: ${this.scid}`);
     }
+    this._retryLimit = 5
+    this._retryCount = 0
 }
 getFileName(sf) {
     const ext = this.options.ext || 'csv';
@@ -70,6 +72,19 @@ class Task {
     ...this.options.requestInit
 });
 console.log(response);
+if (response.status !== 200) {
+    if (++this._retryCount < this._retryLimit) {
+        await sleep(DELAY);
+        const text = await response.text();
+        console.log(`重试请求, ${text}`);
+        return await this.send(...arguments);
+    } else {
+        console.error(`重试失败: ${response.url}`);
+        return false;
+    }
+}
+this._retryCount = 0; // reset
 let blob;
 if (callback) {
     blob = await callback.call(this, response);
@@ -441,7 +456,7 @@ export async function createTasks(uri, period) {
 }, {
     fromDate,
     toDate,
-}, { headers: gHeaders }),
+}),
 new Task({
     apiName: "poAudit",
     showName: "Inbound receipts"
......
@@ -21,10 +21,6 @@ export const rHeaders = Object.freeze({
     "accept": "application/json",
     "wm_aurora.market": "US",
 });
-export const gHeaders = Object.freeze({
-    "Accept-Encoding": "gzip, deflate, br",
-    "wm_aurora.market": "US",
-});
 export const headers = Object.freeze({
     "wm_aurora.market": "US",
 });
loguru
aiofiles
bs4
curl_cffi
fake_useragent
tenacity
oss2
import asyncio
import json
import os
import random
import re
import sys
import threading
import traceback
import uuid
from abc import ABCMeta, abstractmethod
import aiofiles
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession
from fake_useragent import UserAgent
from loguru import logger
from oss2 import Auth, Bucket
from tenacity import retry, RetryError, stop_after_attempt
def ignore_exceptions(*args, **kwargs):
pass
# swallow all uncaught exceptions so the process dies quietly instead of printing tracebacks
sys.excepthook = ignore_exceptions
UA = UserAgent(platforms=['pc'])
DOMAIN = "https://20tools.net"
OSS_CONFIG = {
# credentials omitted; oss_upload below expects access_key_id,
# access_key_secret, endpoint and bucket_name keys here
}
class ProxiesError(Exception):
pass
class Tool:
@staticmethod
async def download_image(s, url):
"""
Download an image.
:param s:
:param url:
:return:
"""
name = str(uuid.uuid4()) + ".jpg"
file_path = f'../data/{name}'
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'user-agent': UA.random,
}
for i in range(3):
try:
proxies = {
"http": "http://127.0.0.1:7890",
"https": "http://127.0.0.1:7890",
}
response = await s.get(url, headers=headers, timeout=60, proxies=proxies)
content = response.content
async with aiofiles.open(file_path, 'wb') as file:
await file.write(content)
return os.path.abspath(file_path)
except Exception as e:
logger.error(f"下载图片失败: {url} {e}")
@staticmethod
async def oss_upload(s, url):
"""
Upload an image to Aliyun OSS.
:param s:
:param url:
:return:
"""
# download the image, then upload it
image = await Tool.download_image(s, url)
if image:
for i in range(5):
try:
save_name = f'temp-image/{str(uuid.uuid4())}.jpg'
auth = Auth(OSS_CONFIG['access_key_id'], OSS_CONFIG['access_key_secret'])
bucket = Bucket(auth, OSS_CONFIG['endpoint'], OSS_CONFIG['bucket_name'])
bucket.put_object_from_file(save_name, image)
# remove the local copy
os.remove(image)
# map the original URL to its uploaded OSS URL
return {
url: "https://yunyi-live.oss-cn-hangzhou.aliyuncs.com/" + save_name
}
except:
logger.error(f"上传图片失败: {url}")
@staticmethod
async def replace_case(_text, sub_str, replace_str):
"""
Replace a substring, ignoring case.
:param _text:
:param sub_str:
:param replace_str:
:return:
"""
compileObj = re.compile(re.escape(sub_str), re.IGNORECASE)
resultantStr = compileObj.sub(replace_str, _text)
return resultantStr
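# Usage sketch (hypothetical values): re.escape + re.IGNORECASE make this a
# literal, case-insensitive replacement, so
#   await Tool.replace_case("Nike shoes by NIKE", "nike", "")
# returns " shoes by ".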
@staticmethod
async def remove_duplicate_images(image_list):
seen_images = set()
unique_images = []
for image in image_list:
if image is None:
continue
# normalize the URL so resized copies of one image compare equal
new_image = image.replace('_AC_SL1500_', '').replace('_AC_SX1500_', '')
if new_image not in seen_images:
unique_images.append(image)
seen_images.add(new_image)
return unique_images
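# Usage sketch (hypothetical URLs): stripping the size-suffix markers makes the
# two resized variants below compare equal, so only the first survives:
#   await Tool.remove_duplicate_images([
#       "https://example.com/a._AC_SL1500_.jpg",
#       "https://example.com/a._AC_SX1500_.jpg",
#   ])  # -> ["https://example.com/a._AC_SL1500_.jpg"]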
@staticmethod
async def remove_img_with_maximum(_html, _max=0):
"""
Remove img tags beyond the given count from the HTML.
:param _html:
:param _max:
:return:
"""
soup = BeautifulSoup(_html, 'html.parser')
images = soup.find_all('img')
if len(images) > _max:
# drop the excess tags from the end
images = images[_max:]
for img in images:
img.decompose()
new_html_str = str(soup)
return new_html_str
@staticmethod
async def get_default_headers():
return {
"Content-Type": "application/json",
"Accept": "application/json",
}
@staticmethod
@retry(stop=stop_after_attempt(3))
async def get_task(s, task_key: str = "", number: int = 1):
"""
Fetch tasks.
:param s:
:param task_key:
:param number:
:return:
"""
url = f"{DOMAIN}/api/collection/task?number={number}&queue={task_key}"
res = await s.get(url)
if res.status_code != 200:
raise Exception(f"获取任务失败")
response = res.json()
return response.get('data', {}).get('list', {})
@staticmethod
async def get_html_to_json(text):
"""
Extract the embedded JSON payload from the HTML.
:param text:
:return:
"""
soup = BeautifulSoup(text, 'html.parser')
# locate the script tag that carries the Next.js JSON payload
script_tag = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})
if script_tag is None:
raise Exception(f"获取JSON数据失败")
json_data = script_tag.string
return json.loads(json_data)
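# Usage sketch (hypothetical markup): any page that embeds its state the
# Next.js way can be parsed like this:
#   html = '<script id="__NEXT_DATA__" type="application/json">{"props": {}}</script>'
#   await Tool.get_html_to_json(html)  # -> {'props': {}}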
@staticmethod
@retry(stop=stop_after_attempt(10))
async def callback(s, data):
"""
Post task results back to the API.
:param s:
:param data:
:return:
"""
default_header = await Tool.get_default_headers()
url = f"{DOMAIN}/api/collection/task"
response = await s.post(url, headers=default_header, data=json.dumps(data), timeout=10)
data = response.json()
if data["code"] != 0:
raise Exception(f"回调接口失败")
return data
@staticmethod
async def get_walmart_headers():
"""
Build Walmart request headers.
:return:
"""
return {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'referer': 'https://www.walmart.com/',
'user-agent': UA.edge,
# 'Cookie': 'xptwj=uz:37e0877dc804c3221718:bdVasz6/73NdttPtcK27fPToos/4oZxy1UVlykJE0nkfLqVhmrKHI452MWIshP06rvlg6Oo4i/CAN+vCMS/BDqQPJtb1jF2UpHbJHuf+N3jupylUCiRaFrAtkQyytLu9SHMeQtQzWLBWK0mgSKk5GRqrGp86eHJ+TbhU//sz5ypzHMVRNtnq; ak_bmsc=2773E11742238181A0BCDEF70472B0E8~000000000000000000000000000000~YAAQjPw7F8Q79RiRAQAAJI6FIRhEoerjT5Fn46CibnTHiZ8nQJv6pLxGhTxu+OWA2qwUlBGLclYLJGQXToJ+BTKfwzjwI5+ud07a9A2L57hKMVbIX2vAo4ZGXA9p0BWAKKn/SbV4VHTnZa/i+pIMZB0ylq0shc7noTTr/tsto5DDd/FHf5vFFICqtEujI+2AckaTNJGYW8PPy9VW6DqXCNpVcgo3qVSntqYOT1bhJLjdYtWmspJGBhEFH5vRQdC7IdN3VqC3BxJLZWzVTaQsmsJYN2Pem7MKiHyk8/AgjKv9ZRs57VOCn/YXSPICVuP9SNUGIJcxZG8Le0VS+L4XcgyY3ngnOx8XBn8MNzpceQ0rKVCI44zb1SD11n6ympD4JV12cwN8L1JFpPs=; vtc=UcbKnqHGuhRWhgfWtgcr70; bstc=UcbKnqHGuhRWhgfWtgcr70; pxcts=85e47474-52ff-11ef-8dba-4f0e258ac181; _pxvid=85e46481-52ff-11ef-8dba-e4bc4df1a16b; adblocked=true; hasACID=true; assortmentStoreId=3081; _shcc=US; _intlbu=false; hasLocData=1; abqme=true; mobileweb=0; xpth=x-o-mart%2BB2C~x-o-mverified%2Bfalse; xpa=54G-6|CoEEB|D2oRZ|ELwUx|IuElO|McEea|MoRkL|NbX17|O8sIU|OFImx|Ocfr2|SqH-y|VyWly|XIItK|eo_el|nzZmL|rd3k-|zf8aF; exp-ck=54G-61D2oRZ1ELwUx1IuElO1NbX172O8sIU1OFImx1SqH-y2VyWly1XIItK2eo_el3nzZmL1zf8aF2; _pxhd=c8185cb38f153869ee089d6ab969bf1ed0ba0f4d4e66cd414b5dfb0daa85c913:85e46481-52ff-11ef-8dba-e4bc4df1a16b; bm_mi=ABB90B88348B58A9787BACA8B2B84DC5~YAAQjPw7F7w99RiRAQAA7KGFIRgasMlRZloJg1t00D254khXjSN/IrLyrciUo2TiMd/5dzEdpQ0rZdLkmWbcqhDvW4LcpJsY0/ViOAItAsERoIpacm5TGpo4+dliNw8JD8aa2peQ5nWBF46y0YMbmPatpEzPfi+SasMjQmt+oWQMr2Q9I3p9CBFvXsmAazCwcGDmXNtTShQbyQ9Gfq93Zgc1eh3WXmhtsDw7hanPmZF2kqaqIL1bBE46OKpVQzJKpiBZVtluHYILY+4LsIaKmwxNJmW1gbqIDx8Sbm4anTBTryfr26L/s3IA5mQ3yyk=~1; _px3=27f094085ac92f8a53a7507dbb323f50efb95f173b554348e0b72a5732857d78:2VRKXx+P0wIrkvwIM7+Xtfysy6oYDVs6V9uhgK8m88W6Ck43XPZSkDLlnFReenWMPrQ3MmpViErhyjVaXANA0A==:1000:5ZwO3UHAT/3uI8KmWYckrGicT4zhb/RLBnKTB2fZu7NK2BVIs9Tp4YrQEPmeQLr27F/Csvs7uj4SuQMN8cPuZyDda7XwJIqyx7V/BlbxwhefKls21slpn9Hkiz0U44U2DITgh0p/sfol2JVGAEXwS66TjQY9DEa3M2GGuD2Xf4+3KT5MAymWIMYp1w5P3Rqtv0KcYxURCTMZDW2B3Ol9/sKFAOeEgEWRfvTh0NaYVLI=; _astc=52d130b133cbd1b501460d9fdae93a97; xptc=_m%2B9~assortmentStoreId%2B3081; xpm=1%2B1722844296%2BUcbKnqHGuhRWhgfWtgcr70~%2B0; _pxde=c92ceb7d7d808ccee6d120ac60cabdac3b14ba9f42bfb2ff4ed5e6ef8f8a7396:eyJ0aW1lc3RhbXAiOjE3MjI4NDQyOTkyMjN9; xptwg=2769187247:ADDF1B60AE2118:1B17408:73DC67D1:89F8600E:2CEC42A1:; TS012768cf=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS01a90220=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS2a5e0c5c027=08edc6f644ab20005250728372d83aee067d8ef4429ed38ad3f72422cd7beb712284fa2bb6dcc53008e648c99a113000bbea3d56aaf743f8797d1fd537dfebae66e076aea8557039e6abbfe3d370af617c79b48e084bdcd637ffb8a8b7b06568; QMReplaySample=true; io_id=2b1e23f1-a177-4e38-86da-561e276b6abf; TS016ef4c8=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS01f89308=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS8cb5a80e027=08bd2f8669ab2000009af9c8550cbc249ae938bfdca0492f8d384c6808c0e90a144e4024b84fbf37082893210e113000ad32e6b74f355e50fc204aad58e20722d6ed74efd203ad1c6a356b2d93d18f547cc29e00ec15f9e4c59e73bb2f5fc352; bm_sv=60519E529ABF4EFE97D2B63408DD5BD1~YAAQjPw7F35D9RiRAQAAANaFIRionbimzr8LgiM2GAxwy+I6Bu2U7faKmNM03jfRJ1ukw3hFQzT+obDLwlGwWa4HEiO9wHosev0vkl9j46QR9DoFq+6/MAGwpf9A8wuMswRgYpSFSZvyAm8uCG9mGPzhuuN5sOmxMflboFyOm2+5jFgcDmBA3WzZRPhRRy1M0xYfthXmO5D7IppDKw8+Zbzj7sG6Wdg5pUBb5XzzWaDNDswJnHdONYEd7O7hOGbyIw==~1'
}
@staticmethod
@retry(stop=stop_after_attempt(3))
async def get_proxies(s):
"""
Fetch a proxy.
:param s:
:return:
"""
proxies_url = "https://20tools.net/api/proxies?sign=ftd*kcm.ygh4mjp7ERJ"
response = await s.get(proxies_url)
ip = response.text
# ip = "127.0.0.1:7890"
if not ip:
raise ProxiesError(f"获取代理失败")
if 'status' in ip:
return None
return {
"http": f"http://{ip}",
"https": f"http://{ip}"
}
@staticmethod
async def get_impersonate():
"""
Pick a browser impersonation profile for curl_cffi.
:return:
"""
impersonates = [
"edge99",
"edge101",
# "safari15_3",
# "safari15_5",
]
return random.choice(impersonates)
@staticmethod
@retry(stop=stop_after_attempt(3))
async def get_html(s, url, header, proxies):
"""
Fetch the page HTML.
:param s:
:param url:
:param header:
:param proxies:
:return:
"""
content = ""
try:
s.headers.clear()
s.cookies.clear()
s.proxies.clear()
walmart_headers = await Tool.get_walmart_headers()
impersonate = await Tool.get_impersonate()
# response = await s.get(url, proxies=proxies, headers=walmart_headers, timeout=10, data={}, impersonate=impersonate)
response = await s.get(url, proxies=proxies, headers=walmart_headers, timeout=10, data={})
content = response.text
except Exception as e:
logger.error(f"获取HTML失败: {url} {e}")
pass
return content
@staticmethod
async def check_content(content):
"""
Sanity-check the fetched content.
:param content:
:return:
"""
if not content:
raise ProxiesError("No content was collected")
if "Robot or human" in content:
raise ProxiesError("Hit a bot-check challenge")
if 'Request Rejected' in content or 'Forbidden' in content:
raise ProxiesError("Request was rejected")
return content
@staticmethod
async def replace_chinese(_text):
"""
Strip Chinese characters from the string.
:param _text:
:return:
"""
return re.sub('[\u4e00-\u9fa5]', '', _text)
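# Usage sketch: \u4e00-\u9fa5 covers the common CJK ideograph range, so
#   await Tool.replace_chinese("Walmart 沃尔玛 exports")  # -> "Walmart  exports"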
class BaseWalmartCrawler(metaclass=ABCMeta):
@abstractmethod
async def get_task_type(self):
pass
@abstractmethod
async def format_(self, content):
pass
@abstractmethod
async def run(self):
pass
@abstractmethod
async def main(self, proxies=None):
pass
@staticmethod
async def start(data, proxies, s):
"""
Fetch a batch of tasks and make sure a proxy is available.
:param data:
:param proxies:
:param s:
:return:
"""
task_name = data['task_name']
_type = data['task_type']
number = data.get('number', 10)
items = await Tool.get_task(s=s, task_key=_type, number=number)
logger.debug(f"{task_name} - 获取到任务 {len(items)} 条")
if len(items) == 0:
await asyncio.sleep(10)
logger.debug(f"{task_name} - 无任务")
return items, proxies
if proxies is None:
proxies = await Tool.get_proxies(s)
logger.debug(f"{task_name} - 代理IP: {proxies}")
if not proxies:
logger.error(f"{task_name} - 切换代理IP")
return items, proxies
if proxies == 'error':
logger.error(f"{task_name} - 没有代理IP可用")
sys.exit(0)
return items, proxies
@staticmethod
async def get_content(s, item, walmart_headers, proxies):
"""
Fetch and validate page content.
:param s:
:param item:
:param walmart_headers:
:param proxies:
:return:
"""
url = item['url']
content = await Tool.get_html(s, url, walmart_headers, proxies)
return await Tool.check_content(content)
@staticmethod
async def retry(s, items, success_item, data):
"""
Re-queue tasks that did not complete.
:param s:
:param items:
:param success_item:
:param data:
:return:
"""
queue = data['queue']
task_name = data['task_name']
value = data.get('value', 'url')
new_items = list()
for item in items:
if item[value] not in success_item:
new_items.append(item)
if new_items:
data = {
"data": new_items,
"queue": queue,
"type": 10,
}
logger.error(f"{task_name} - 上传失败的任务 {json.dumps(data)}")
await Tool.callback(s, data)
class Monitoring(BaseWalmartCrawler):
"""
Walmart product monitoring
"""
task_name = "Walmart product monitoring"
async def get_task_type(self):
"""
Return the task queue key.
:return:
"""
return "walmart:product"
async def format_(self, content):
"""
Normalize the scraped data.
:param content:
:return:
"""
data_dict = await Tool.get_html_to_json(content)
try:
data = data_dict['props']['pageProps']['initialData']['data']
except:
raise ProxiesError(f"获取数据异常")
product = data['product']
product = {} if product is None else product
skus = []
is_link_error = ""
free_delivery = ""
freight_price = 0
if not product:
logger.error("商品不存在")
is_link_error = "Commodity nonexistence"
fulfillment_type = ""
buy_now_eligible = ""
postal_code = product.get('location', {}).get('postalCode', '')
if product:
buy_now_eligible = product.get('buyNowEligible', False)
buy_now_eligible = 'no' if buy_now_eligible is False else ''
variant_criteria = product['variantCriteria']
skus = []
variant_list = {}
for item in variant_criteria:
item_variant_list = item.get('variantList', {})
for variant in item_variant_list:
variant_list[variant['id']] = variant['name']
variants_map = product['variantsMap']
fulfillment_options = product.get('fulfillmentOptions', [])
fulfillment_type = product.get('fulfillmentType', '')
for fulfillment_option in fulfillment_options:
if fulfillment_option['__typename'] == 'ShippingOptionV2':
speed_details = fulfillment_option['speedDetails']
if speed_details:
free_delivery = speed_details.get('deliveryDate', '')
freight_price = speed_details.get('fulfillmentPrice', {})
if freight_price:
freight_price = freight_price.get('price', 0)
brand = product.get('brand', "")
for (map_id, _map) in variants_map.items():
_map = _map if _map is not None else {}
variants = _map.get('variants', {})
sku_name = []
if not variants:
sku_name = ['default']
else:
for sku in variants:
sku_name.append(variant_list[sku])
current_price = _map.get('priceInfo', {}).get('currentPrice', {})
current_price = current_price if current_price is not None else {}
skus.append({
'price': current_price.get('price', 0),
"postal_code": postal_code,
'status': _map.get('availabilityStatus', ''),
"free_delivery": free_delivery,
"brand": brand,
"is_link_error": is_link_error,
"asin": _map.get('id', ''),
"sku_name": sku_name,
"star_level": product['averageRating'],
"freight_price": freight_price,
"fulfillment_type": fulfillment_type,
"buy_now_eligible": buy_now_eligible,
'ship_from': '',
'sold_by': '',
})
if not skus:
current_price = product.get('priceInfo', {}).get('currentPrice', {})
current_price = current_price if current_price is not None else {}
skus.append({
'price': current_price.get('price', 0),
'status': product.get('availabilityStatus', ''),
"free_delivery": free_delivery,
"is_link_error": is_link_error,
"asin": product.get('id', ''),
"sku_name": "default",
"star_level": product.get('averageRating', 0),
"freight_price": freight_price,
"fulfillment_type": fulfillment_type,
"buy_now_eligible": buy_now_eligible,
"postal_code": postal_code,
'ship_from': '',
'sold_by': '',
})
return {
'is_link_error': is_link_error,
'skus': skus
}
async def main(self, proxies=None):
"""
Run one scrape cycle.
:return:
"""
walmart_headers = await Tool.get_walmart_headers()
async with AsyncSession(max_clients=50, timeout=10) as s:
upload_list = list()
items = list()
item_ids = []  # initialized up front so the `if items:` block below cannot raise NameError
success_item = []
try:
start_data = {
'task_name': self.task_name,
'task_type': await self.get_task_type(),
'number': 10,
}
items, proxies = await self.start(start_data, proxies, s)
item_ids = [d['item_id'] for d in items]
for item in items:
logger.debug(f"{self.task_name} - 采集 {item['url']}")
content = await self.get_content(s, item, walmart_headers, proxies)
response = await self.format_(content)
for sku in response.get('skus', []):
sku['collection_type'] = item['collection_type']
sku['item_id'] = item['item_id']
sku['admin_users_id'] = item['admin_users_id']
sku['app_name'] = item.get('app_name', 'admin')
upload_list.append(sku)
success_item.append(item['url'])
except ProxiesError as e:
logger.error(f"{self.task_name} - 代理异常 - {e}")
proxies = None
except Exception as e:
logger.exception(f"{self.task_name} - 任务异常 - {e}")
if items:
data = {
"data": {
"error_items": item_ids,
"collection": upload_list,
},
"type": 5,
}
logger.success(f"{self.task_name} - 共采集到 {len(upload_list)} 条数据")
try:
await Tool.callback(s, data)
except RetryError:
logger.error(f"{self.task_name} - 回调失败")
return proxies
async def run(self):
proxies = None
while True:
proxies = await self.main(proxies)
class Search(BaseWalmartCrawler):
task_name = "沃尔玛搜索"
async def get_task_type(self):
"""
Return the task queue key.
:return:
"""
return "walmart:search"
async def format_(self, content):
"""
Normalize the scraped data.
:param content:
:return:
"""
if 'clear your filters and start over' in content:
logger.error('Search returned no results')
return None
data_dict = await Tool.get_html_to_json(content)
result = []
try:
items = data_dict['props']['pageProps']['initialData']['searchResult']['itemStacks'][0]['items']
except:
raise ProxiesError(f"获取数据异常")
for item in items:
if item.get('__typename', 'AdPlaceholder') != 'Product':
continue
delivery = item['fulfillmentSummary'][0]['deliveryDate'] if item.get('fulfillmentSummary', None) else ""
fulfillment_type = item.get('fulfillmentType', "")
result.append({
"title": item.get('name', ""),
"price": item['priceInfo']['linePrice'],
"delivery": delivery,
"review_count": item['numberOfReviews'],
"unique_value": item['usItemId'],
"image": item['image'],
"is_prime": "",
"star_level": item['averageRating'],
"fulfillment_type": fulfillment_type
})
return result
async def main(self, proxies=None):
"""
Run one scrape cycle.
:return:
"""
walmart_headers = await Tool.get_walmart_headers()
async with AsyncSession(max_clients=50) as s:
success_item = []
items = []
# build start_data before the try so the retry block below can reference it
start_data = {
'task_name': self.task_name,
'task_type': await self.get_task_type(),
'number': 10,
}
try:
items, proxies = await self.start(start_data, proxies, s)
for item in items:
logger.debug(f"{self.task_name} - 采集 {item['url']}")
content = await self.get_content(s, item, walmart_headers, proxies)
result = await self.format_(content)
if result is None:
success_item.append(item['url'])
continue
data = {
"data": {
"task_id": item["task_id"],
"collection_data": result,
'platform_type': item.get('platform_type', 3)
},
"type": item['callback_type'],
"admin_users_id": item["admin_users_id"],
"app_name": item.get("app_name", "admin")
}
logger.success(f"{self.task_name} - 采集完成")
try:
await Tool.callback(s, data)
except RetryError:
logger.error(f"{self.task_name} - 回调失败")
success_item.append(item['url'])
except ProxiesError as e:
logger.error(f"{self.task_name} - 代理异常 - {e}")
proxies = None
except Exception as e:
traceback.print_exc()
logger.error(f"{self.task_name} - 任务异常 - {e}")
retry_data = {
"queue": start_data['task_type'],
"task_name": self.task_name,
}
await self.retry(s, items, success_item, retry_data)
return proxies
async def run(self):
proxies = None
while True:
proxies = await self.main(proxies)
class Goods(BaseWalmartCrawler):
task_name = "沃尔玛商品"
async def get_task_type(self):
"""
Return the task queue key.
:return:
"""
return "walmart:product-detail"
async def format_(self, content):
"""
Normalize the scraped data.
:param content:
:return:
"""
data_dict = await Tool.get_html_to_json(content)
try:
data = data_dict['props']['pageProps']['initialData']['data']
except:
raise ProxiesError(f"获取数据异常")
product = data.get('product', {})
if product is None:
return None
imageInfo = product.get('imageInfo', {})
if imageInfo:
all_images = imageInfo.get('allImages', [])
else:
all_images = []
images = []
idml = data['idml']
for image in all_images:
images.append(image.get('url', ''))
image_map = product['imageMap']
variant_criteria = product['variantCriteria']
skus = []
variant_list = {}
variant_images = {}
for item in variant_criteria:
item_variant_list = item.get('variantList', {})
for variant in item_variant_list:
variant_products = variant.get('products', [])
for variant_product in variant_products:
if variant['images']:
variant_images[variant_product] = variant['images'][0]
variant_list[variant['id']] = variant['name']
fulfillment_type = product.get('fulfillmentType', '')
variants_map = product['variantsMap']
is_link_error = ""
freight_price = ''
free_delivery = ''
fulfillment_options = product.get('fulfillmentOptions', [])
buy_now_eligible = product.get('buyNowEligible', False)
for fulfillment_option in fulfillment_options:
if fulfillment_option['__typename'] == 'ShippingOptionV2':
speed_details = fulfillment_option['speedDetails']
if speed_details:
free_delivery = speed_details.get('deliveryDate', '')
freight_price = speed_details.get('fulfillmentPrice', {})
if freight_price:
freight_price = freight_price.get('price', 0)
brand = product.get('brand', "")
product_use_item_id = product.get('usItemId', '')
product_sku_name = ''
for (map_id, _map) in variants_map.items():  # renamed to avoid shadowing the builtins id/map
variants = _map.get('variants', [])
sku_name = []
# reorder the variant keys
new_variants = []
for sku in variants:
# make sure color comes first
if sku.find('actual_color') != -1:
new_variants.insert(0, sku)
else:
new_variants.append(sku)
for sku in new_variants:
sku_name.append(variant_list[sku])
image_name = variant_images.get(map_id, "")
if image_name:
sku_image = image_map[image_name]['url']
else:
sku_image = _map.get('imageInfo', {}).get('thumbnailUrl', '')
item_id = _map.get('usItemId', '')
try:
price = _map.get('priceInfo', {}).get('currentPrice', {}).get('price', 0)
except Exception as e:
logger.error(f"Failed to get price: {e}")
price = 0
skus.append({
'price': price,
'status': _map.get('availabilityStatus', ''),
"free_delivery": free_delivery,
"brand": brand,
"is_link_error": is_link_error,
"sku_image": sku_image,
"asin": map.get('id', ''),
'item_id': item_id,
"sku_name": sku_name,
"star_level": product['averageRating'],
"freight_price": freight_price,
'fulfillment_type': fulfillment_type,
'buy_now_eligible': buy_now_eligible,
'ship_from': '',
'sold_by': '',
"delivery_info": '',
"is_buy_now": '',
})
if product_use_item_id == item_id:
product_sku_name = sku_name
if not skus:
default_sku_name = "default"
skus.append({
'price': product.get('priceInfo', {}).get('currentPrice', {}).get('price', 0),
'status': product.get('availabilityStatus', ''),
"free_delivery": free_delivery,
"brand": brand,
"is_link_error": is_link_error,
"sku_image": images[0],
"asin": product.get('id', ''),
'item_id': product_use_item_id,
"sku_name": default_sku_name,
"star_level": product['averageRating'],
"freight_price": freight_price,
'fulfillment_type': fulfillment_type,
'buy_now_eligible': buy_now_eligible,
'ship_from': '',
'sold_by': '',
"delivery_info": '',
"is_buy_now": '',
})
product_sku_name = default_sku_name
# drop the last image
images = images[0:-1]
return {
"title": product.get('name', ''),
"images": images,
"point_description": idml['shortDescription'],
"descriptions": idml['longDescription'],
"skus": skus,
'brand': brand,
"asin": product_use_item_id,
'sku_name': product_sku_name,
}
async def main(self, proxies=None):
"""
Run one scrape cycle.
:return:
"""
async with AsyncSession(max_clients=50) as s:
success_item = []
items = []
# build start_data before the try so the retry block below can reference it
start_data = {
'task_name': self.task_name,
'task_type': await self.get_task_type(),
'number': 5,
}
try:
items, proxies = await self.start(start_data, proxies, s)
walmart_headers = await Tool.get_walmart_headers()
# per-item scrape; bad product info is handled below
for item in items:
logger.debug(f"{self.task_name} - 采集 {item['url']}")
content = await self.get_content(s, item, walmart_headers, proxies)
response = await self.format_(content)
if response is None:
logger.success(f"{item['url']} - 商品不存在")
success_item.append(item['url'])
continue
brand = list()
result = {
"default_info": {
"title": response['title'],
"images": response['images'],
"point_description": response['point_description'],
"descriptions": response['descriptions'],
'asin': response.get('asin', ''),
'sku_name': response.get('sku_name', ''),
}
}
images = result['default_info']['images']
title = await Tool.replace_chinese(result['default_info']['title'])
point_description = result['default_info']['point_description']
# upload all images
download_images = []
for image in images:
try:
download_images.append(image)
except:
continue
for sku in response['skus']:
try:
download_images.append(sku['sku_image'])
except:
continue
new_images = []
# de-duplicate before downloading
logger.debug(f"{self.task_name} - starting image downloads")
download_images = await Tool.remove_duplicate_images(download_images)
if download_images:
download_list = []
for image in download_images:
download_list.append(Tool.oss_upload(s, image))
new_images = await asyncio.gather(*download_list)
logger.debug(f"{self.task_name} - image downloads finished")
new_skus = []
replace_image = []
# swap the original image URLs for the uploaded OSS URLs
for new in new_images:
if not new:
continue  # oss_upload returns None on failure; skip those
for sku in response['skus']:
if sku['sku_image'] in new:
sku['sku_image'] = new[sku['sku_image']]
new_skus.append(sku)
for image in images:
if image in new:
replace_image.append(new[image])
images = replace_image
brand.append(response['brand'])
descriptions = response['descriptions']
if not descriptions:
descriptions = ""
for image in images:
descriptions += f"<img src='{image}' />"
descriptions = await Tool.remove_img_with_maximum(descriptions, 25)
new_point_description = point_description
brand = brand[0]
if brand:
descriptions = await Tool.replace_case(descriptions, brand, '')
new_point_description = await Tool.replace_case(point_description, brand, '')
new_point_description = re.sub(r'<li></li>', '', new_point_description)
title = await Tool.replace_case(title, brand, '')
# truncate titles longer than 255 characters
if len(title) > 255:
title = title[0:255]
descriptions = new_point_description + descriptions
descriptions = descriptions.replace('100%', '')
title = title.replace('100%', '')
result['default_info']['descriptions'] = descriptions
result['default_info']['point_description'] = new_point_description
result['skus'] = new_skus
result['brand'] = response['brand']
result['default_info']['title'] = title
result['default_info']['images'] = images
result['url'] = item.get('url', '')
data = {
"data": {
"task_id": item.get('id', 0),
"collection_data": result,
"log_id": item.get('log_id', 0),
'platform_type': item.get('platform_type', 3),
'export_type': item.get('export_type', 1),
},
"type": 4,
"admin_users_id": item.get('admin_users_id', 0),
"app_name": item.get("app_name", "admin"),
}
# logger.success(f"上传数据: {json.dumps(data)}")
logger.success(f"{self.task_name} - 采集完成")
await Tool.callback(s, data)
success_item.append(item['url'])
except ProxiesError as e:
logger.error(f"{self.task_name} - 代理异常 - {e}")
proxies = None
except Exception as e:
traceback.print_exc()
logger.error(f"{self.task_name} - 任务异常 - {e}")
retry_data = {
"queue": start_data['task_type'],
"task_name": self.task_name,
}
await self.retry(s, items, success_item, retry_data)
return proxies
async def run(self):
proxies = None
while True:
proxies = await self.main(proxies)
class Test:
async def run(self):
while True:
try:
async with AsyncSession() as s:
url = "http://www.zdopen.com/ShortProxy/BindIP?api=202406041824314753&akey=4fee8e19764876e1&i=1"
content = await s.get(url)
logger.success(content.json())
await asyncio.sleep(30)
except:
logger.error(f"获取代理失败")
await asyncio.sleep(30)
async def run():
threads = []
for i in range(thread_number):
if task_type == "1":
t = Monitoring()
elif task_type == "3":
t = Goods()
elif task_type == "4":
t = Test()
else:
t = Search()
t.task_name = "Thread-" + str(i + 1)
t = threading.Thread(target=asyncio.run, args=(t.run(),))
threads.append(t)
for t in threads:
t.start()
for t in threads:
t.join()
if __name__ == '__main__':
# task_type = input("Select a type (default: 1):\n 1. Monitoring\n 2. Search\n 3. Product\n > ")
task_type = sys.argv[1] if len(sys.argv) > 1 else None
if not task_type or task_type not in ["1", "2", "3", "4"]:
task_type = "1"
thread_number = sys.argv[2] if len(sys.argv) > 2 else None
if not thread_number:
thread_number = 1
else:
thread_number = int(thread_number)
logger.success(f"Task type: {task_type}")
# WindowsSelectorEventLoopPolicy exists only on Windows; guard it so the script also starts elsewhere
if sys.platform == 'win32':
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
asyncio.run(run())
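The crawler classes above share one control-flow idea: main() returns the proxy it ended up with, run() feeds it back in, and a ProxiesError resets it to None so the next cycle fetches a fresh one. A stripped-down, self-contained sketch of that loop; main_cycle and its hard-coded proxy are hypothetical stand-ins for the real task and proxy calls:

import asyncio

class ProxiesError(Exception):
    pass

async def main_cycle(proxies):
    # stand-in for one Monitoring/Search/Goods cycle
    try:
        if proxies is None:
            proxies = {"http": "http://127.0.0.1:7890"}  # pretend get_proxies()
        # ... scrape with `proxies` here; raise ProxiesError on captchas/bans ...
    except ProxiesError:
        proxies = None  # drop the bad proxy; the next cycle fetches a fresh one
    return proxies

async def run_forever(cycles=3):
    proxies = None
    for _ in range(cycles):  # the real scripts loop with `while True`
        proxies = await main_cycle(proxies)

asyncio.run(run_forever())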
import asyncio
import json
import os
import random
import traceback
from urllib3.exceptions import ProxyError
from bs4 import BeautifulSoup
from curl_cffi.requests import AsyncSession, Response
from fake_useragent import UserAgent
from loguru import logger
from tenacity import retry, stop_after_attempt
UA = UserAgent(platforms=['pc'])
DOMAIN = "https://20tools.net"
class Tool:
@staticmethod
async def get_walmart_headers():
"""
Build Walmart request headers.
:return:
"""
return {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'referer': 'https://www.walmart.com/',
'user-agent': UA.edge,
# 'Cookie': 'xptwj=uz:37e0877dc804c3221718:bdVasz6/73NdttPtcK27fPToos/4oZxy1UVlykJE0nkfLqVhmrKHI452MWIshP06rvlg6Oo4i/CAN+vCMS/BDqQPJtb1jF2UpHbJHuf+N3jupylUCiRaFrAtkQyytLu9SHMeQtQzWLBWK0mgSKk5GRqrGp86eHJ+TbhU//sz5ypzHMVRNtnq; ak_bmsc=2773E11742238181A0BCDEF70472B0E8~000000000000000000000000000000~YAAQjPw7F8Q79RiRAQAAJI6FIRhEoerjT5Fn46CibnTHiZ8nQJv6pLxGhTxu+OWA2qwUlBGLclYLJGQXToJ+BTKfwzjwI5+ud07a9A2L57hKMVbIX2vAo4ZGXA9p0BWAKKn/SbV4VHTnZa/i+pIMZB0ylq0shc7noTTr/tsto5DDd/FHf5vFFICqtEujI+2AckaTNJGYW8PPy9VW6DqXCNpVcgo3qVSntqYOT1bhJLjdYtWmspJGBhEFH5vRQdC7IdN3VqC3BxJLZWzVTaQsmsJYN2Pem7MKiHyk8/AgjKv9ZRs57VOCn/YXSPICVuP9SNUGIJcxZG8Le0VS+L4XcgyY3ngnOx8XBn8MNzpceQ0rKVCI44zb1SD11n6ympD4JV12cwN8L1JFpPs=; vtc=UcbKnqHGuhRWhgfWtgcr70; bstc=UcbKnqHGuhRWhgfWtgcr70; pxcts=85e47474-52ff-11ef-8dba-4f0e258ac181; _pxvid=85e46481-52ff-11ef-8dba-e4bc4df1a16b; adblocked=true; hasACID=true; assortmentStoreId=3081; _shcc=US; _intlbu=false; hasLocData=1; abqme=true; mobileweb=0; xpth=x-o-mart%2BB2C~x-o-mverified%2Bfalse; xpa=54G-6|CoEEB|D2oRZ|ELwUx|IuElO|McEea|MoRkL|NbX17|O8sIU|OFImx|Ocfr2|SqH-y|VyWly|XIItK|eo_el|nzZmL|rd3k-|zf8aF; exp-ck=54G-61D2oRZ1ELwUx1IuElO1NbX172O8sIU1OFImx1SqH-y2VyWly1XIItK2eo_el3nzZmL1zf8aF2; _pxhd=c8185cb38f153869ee089d6ab969bf1ed0ba0f4d4e66cd414b5dfb0daa85c913:85e46481-52ff-11ef-8dba-e4bc4df1a16b; bm_mi=ABB90B88348B58A9787BACA8B2B84DC5~YAAQjPw7F7w99RiRAQAA7KGFIRgasMlRZloJg1t00D254khXjSN/IrLyrciUo2TiMd/5dzEdpQ0rZdLkmWbcqhDvW4LcpJsY0/ViOAItAsERoIpacm5TGpo4+dliNw8JD8aa2peQ5nWBF46y0YMbmPatpEzPfi+SasMjQmt+oWQMr2Q9I3p9CBFvXsmAazCwcGDmXNtTShQbyQ9Gfq93Zgc1eh3WXmhtsDw7hanPmZF2kqaqIL1bBE46OKpVQzJKpiBZVtluHYILY+4LsIaKmwxNJmW1gbqIDx8Sbm4anTBTryfr26L/s3IA5mQ3yyk=~1; _px3=27f094085ac92f8a53a7507dbb323f50efb95f173b554348e0b72a5732857d78:2VRKXx+P0wIrkvwIM7+Xtfysy6oYDVs6V9uhgK8m88W6Ck43XPZSkDLlnFReenWMPrQ3MmpViErhyjVaXANA0A==:1000:5ZwO3UHAT/3uI8KmWYckrGicT4zhb/RLBnKTB2fZu7NK2BVIs9Tp4YrQEPmeQLr27F/Csvs7uj4SuQMN8cPuZyDda7XwJIqyx7V/BlbxwhefKls21slpn9Hkiz0U44U2DITgh0p/sfol2JVGAEXwS66TjQY9DEa3M2GGuD2Xf4+3KT5MAymWIMYp1w5P3Rqtv0KcYxURCTMZDW2B3Ol9/sKFAOeEgEWRfvTh0NaYVLI=; _astc=52d130b133cbd1b501460d9fdae93a97; xptc=_m%2B9~assortmentStoreId%2B3081; xpm=1%2B1722844296%2BUcbKnqHGuhRWhgfWtgcr70~%2B0; _pxde=c92ceb7d7d808ccee6d120ac60cabdac3b14ba9f42bfb2ff4ed5e6ef8f8a7396:eyJ0aW1lc3RhbXAiOjE3MjI4NDQyOTkyMjN9; xptwg=2769187247:ADDF1B60AE2118:1B17408:73DC67D1:89F8600E:2CEC42A1:; TS012768cf=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS01a90220=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS2a5e0c5c027=08edc6f644ab20005250728372d83aee067d8ef4429ed38ad3f72422cd7beb712284fa2bb6dcc53008e648c99a113000bbea3d56aaf743f8797d1fd537dfebae66e076aea8557039e6abbfe3d370af617c79b48e084bdcd637ffb8a8b7b06568; QMReplaySample=true; io_id=2b1e23f1-a177-4e38-86da-561e276b6abf; TS016ef4c8=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS01f89308=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS8cb5a80e027=08bd2f8669ab2000009af9c8550cbc249ae938bfdca0492f8d384c6808c0e90a144e4024b84fbf37082893210e113000ad32e6b74f355e50fc204aad58e20722d6ed74efd203ad1c6a356b2d93d18f547cc29e00ec15f9e4c59e73bb2f5fc352; bm_sv=60519E529ABF4EFE97D2B63408DD5BD1~YAAQjPw7F35D9RiRAQAAANaFIRionbimzr8LgiM2GAxwy+I6Bu2U7faKmNM03jfRJ1ukw3hFQzT+obDLwlGwWa4HEiO9wHosev0vkl9j46QR9DoFq+6/MAGwpf9A8wuMswRgYpSFSZvyAm8uCG9mGPzhuuN5sOmxMflboFyOm2+5jFgcDmBA3WzZRPhRRy1M0xYfthXmO5D7IppDKw8+Zbzj7sG6Wdg5pUBb5XzzWaDNDswJnHdONYEd7O7hOGbyIw==~1'
}
@staticmethod
async def get_impersonate():
"""
Pick a browser impersonation profile for curl_cffi.
:return:
"""
impersonates = [
"edge99",
"edge101",
# "safari15_3",
# "safari15_5",
]
return random.choice(impersonates)
@staticmethod
async def get_html_to_json(text):
"""
Extract the embedded JSON payload from the HTML.
:param text:
:return:
"""
soup = BeautifulSoup(text, 'html.parser')
# locate the script tag that carries the Next.js JSON payload
script_tag = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})
# locate the ratings-count link; its text looks like "1,234 ratings"
a_tag = soup.find('a', {"link-identifier": "reviewsLink"})
# rstrip(' ratings') strips characters rather than a suffix; parse the leading number instead
ratings = int(a_tag.string.split()[0].replace(',', '')) if a_tag and a_tag.string else 0
if script_tag is None:
raise Exception(f"获取JSON数据失败")
json_data = json.loads(script_tag.string)
json_data["$ratings"] = ratings
return json_data
@staticmethod
@retry(stop=stop_after_attempt(3))
async def get_html(s: AsyncSession, url, header, proxies):
"""
Fetch the page HTML.
:param s:
:param url:
:param header:
:param proxies:
:return:
"""
content = ""
try:
s.headers.clear()
s.cookies.clear()
s.proxies.clear()
walmart_headers = await Tool.get_walmart_headers()
impersonate = await Tool.get_impersonate()
# response = await s.get(url, proxies=proxies, headers=walmart_headers, timeout=10, data={}, impersonate=impersonate)
response = await s.get(url, proxies=proxies, headers=walmart_headers, timeout=10, data={})
content = response.text
# logger.debug(content)
except Exception as e:
logger.error(f"获取HTML失败: {url} {e}")
return content
@staticmethod
@retry(stop=stop_after_attempt(3))
async def get_proxies(s):
"""
Fetch a proxy.
:param s:
:return:
"""
proxies_url = "https://20tools.net/api/proxies?sign=ftd*kcm.ygh4mjp7ERJ"
response = await s.get(proxies_url)
ip = response.text
# ip = "127.0.0.1:7890"
if not ip:
raise ProxyError(f"获取代理失败")
if 'status' in ip:
return None
return {
"http": f"http://{ip}",
"https": f"http://{ip}"
}
@staticmethod
@retry(stop=stop_after_attempt(3))
async def get_tasks(s: AsyncSession):
"""获取任务
:param s: _description_
:raises Exception: _description_
:return: _description_
"""
url = "https://walmart.meinuosha.com/index.php/index/index/GetstheitemitemId?accessvalue=Walmart2025PY0307"
res: Response = await s.get(url)
if res.status_code != 200:
raise Exception(f"获取任务失败")
data: dict = res.json()
return [
it["itemId"] for it in data.get("shop_items", [{}])
]
class Goods:
task_name = "沃尔玛商品"
async def format_(self, content):
"""
Normalize the scraped data.
:param content:
:return:
"""
data_dict = await Tool.get_html_to_json(content)
try:
data = data_dict['props']['pageProps']['initialData']['data']
ratings = data_dict["$ratings"]
except:
raise ProxyError(f"获取数据异常")
product: dict = data.get('product', {})
if product is None:
return None
os.makedirs('./log', exist_ok=True)  # the dump below fails if ./log is missing
with open(f'./log/item_{product["usItemId"]}.json', 'w') as f:
json.dump(product, f)
category = [
it.get("name", '')
for it in product.get("category", {}).get("path", [{}])
]
crossed_price = (product.get("priceInfo", {}).get("wasPrice") or {}).get("price", '')
main_image = product.get("imageInfo", {}).get("allImages", [{}])[0].get("url", '')
promo_discount = product.get("promoDiscount", "")
pro_seller_flag = any(
it.get("type") == "PRO_SELLER"
for it in product.get("trustBadges", [])
)
return {
"title": product.get('name', ''),
"main_image": main_image,
"average_rating": product.get('averageRating', 0),
"ratings": ratings,
"price": product.get("priceInfo", {}).get("currentPrice", {}).get("price", 0),
"crossed_price": crossed_price,
"category": category,
'brand': product.get('brand', ""),
"seller_display_name": product.get("sellerDisplayName", ""),
"manufacturer_part_number": product.get('manufacturerProductId', ""),
"pro_seller": "PRO_SELLER" if pro_seller_flag else '',
"promo_discount": promo_discount,
}
async def main(self, proxies=None):
"""
运行
:return:
"""
async with AsyncSession(max_clients=50) as s:
try:
ip_lst = await Tool.get_tasks(s)
# k = len(ip_lst)
for ip in random.choices(ip_lst, k=1):
# ip = 8126721115
url = f"https://www.walmart.com/ip/{ip}"
walmart_headers = await Tool.get_walmart_headers()
proxies = await Tool.get_proxies(s)
content = await Tool.get_html(s, url, walmart_headers, proxies)
# content = await Tool.get_html(s, url, walmart_headers, None)
response = await self.format_(content)
logger.success(f"{url} - {response}")
except Exception as e:
traceback.print_exc()
logger.error(f"{self.task_name} - 任务异常 - {e}")
if __name__ == '__main__':
# asyncio.run replaces the deprecated get_event_loop()/run_until_complete pair
asyncio.run(Goods().main())
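The format_ method above leans on one defensive pattern worth isolating: `(d.get("k") or {}).get(...)` survives both a missing key and an explicit null, where plain `d.get("k", {})` only survives the missing key. A self-contained illustration with made-up data:

product = {"priceInfo": {"wasPrice": None}}

# .get("wasPrice", {}) returns None here (the key exists but is null), so chaining
# another .get() on it would raise AttributeError; `or {}` swaps the None for a dict first.
crossed_price = (product.get("priceInfo", {}).get("wasPrice") or {}).get("price", '')
print(crossed_price)  # prints an empty string, the fallback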