Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
W
wfs_export
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
yexing
wfs_export
Commits
48f21530
Commit
48f21530
authored
Jan 06, 2026
by
yexing
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
优化代码
parent
e04e6148
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
105 additions
and
66 deletions
+105
-66
src/walmart_demo.py
+104
-65
src/walmart_sup.py
+1
-1
No files found.
src/walmart_demo.py
View file @
48f21530
...
@@ -9,15 +9,14 @@ from datetime import datetime, time
...
@@ -9,15 +9,14 @@ from datetime import datetime, time
from
typing
import
Optional
,
List
from
typing
import
Optional
,
List
import
redis.asyncio
as
redis
import
redis.asyncio
as
redis
from
requests.exceptions
import
ProxyError
from
bs4
import
BeautifulSoup
from
bs4
import
BeautifulSoup
from
curl_cffi.requests
import
AsyncSession
,
Response
from
curl_cffi.requests
import
AsyncSession
,
Response
from
curl_cffi.requests.exceptions
import
Timeout
,
ConnectionError
,
ProxyError
from
fake_useragent
import
UserAgent
from
fake_useragent
import
UserAgent
from
loguru
import
logger
from
loguru
import
logger
from
tenacity
import
retry
,
stop_after_attempt
from
tenacity
import
retry
,
stop_after_attempt
,
wait_random
DOMAIN
=
"https://20tools.net"
UA
=
UserAgent
(
platforms
=
[
"desktop"
])
UA
=
UserAgent
(
platforms
=
[
'desktop'
])
MAX_RETRIES
=
20
MAX_RETRIES
=
20
TASK_DURATION
=
600
TASK_DURATION
=
600
IS_DEBUG
=
int
(
os
.
environ
.
get
(
"IS_DEBUG"
,
False
))
IS_DEBUG
=
int
(
os
.
environ
.
get
(
"IS_DEBUG"
,
False
))
...
@@ -26,6 +25,7 @@ if not IS_DEBUG:
...
@@ -26,6 +25,7 @@ if not IS_DEBUG:
logger
.
remove
()
logger
.
remove
()
logger
.
add
(
LOG_PATH
,
level
=
"INFO"
,
rotation
=
"1 day"
,
retention
=
"1 months"
)
logger
.
add
(
LOG_PATH
,
level
=
"INFO"
,
rotation
=
"1 day"
,
retention
=
"1 months"
)
class
StrCounter
:
class
StrCounter
:
def
__init__
(
self
):
def
__init__
(
self
):
self
.
counts
=
Counter
()
self
.
counts
=
Counter
()
...
@@ -46,10 +46,12 @@ class StrCounter:
...
@@ -46,10 +46,12 @@ class StrCounter:
if
cur
==
-
count
:
if
cur
==
-
count
:
heapq
.
heappush
(
self
.
heap
,
(
count
,
s
))
heapq
.
heappush
(
self
.
heap
,
(
count
,
s
))
return
cur
,
s
return
cur
,
s
return
0
,
''
return
0
,
""
class
AsyncRedisClient
:
class
AsyncRedisClient
:
_instance
:
Optional
[
redis
.
Redis
]
=
None
_instance
:
Optional
[
redis
.
Redis
]
=
None
@staticmethod
@staticmethod
def
get_redis
()
->
redis
.
Redis
:
def
get_redis
()
->
redis
.
Redis
:
"""获取 Redis 客户端(异步)
"""获取 Redis 客户端(异步)
...
@@ -58,19 +60,22 @@ class AsyncRedisClient:
...
@@ -58,19 +60,22 @@ class AsyncRedisClient:
"""
"""
if
AsyncRedisClient
.
_instance
is
None
:
if
AsyncRedisClient
.
_instance
is
None
:
AsyncRedisClient
.
_instance
=
redis
.
Redis
(
AsyncRedisClient
.
_instance
=
redis
.
Redis
(
host
=
"localhost"
,
port
=
6380
,
db
=
0
,
host
=
"localhost"
,
max_connections
=
10
# 设置连接池大小
port
=
int
(
os
.
getenv
(
"REDIS_HOST"
,
6379
)),
password
=
os
.
getenv
(
"REDIS_PASSWORD"
,
"12345678"
),
db
=
0
,
max_connections
=
10
,
# 设置连接池大小
)
)
return
AsyncRedisClient
.
_instance
return
AsyncRedisClient
.
_instance
@staticmethod
@staticmethod
async
def
close_redis
():
async
def
close_redis
():
"""关闭 Redis 客户端(异步)
"""关闭 Redis 客户端(异步)"""
"""
if
AsyncRedisClient
.
_instance
is
not
None
:
if
AsyncRedisClient
.
_instance
is
not
None
:
await
AsyncRedisClient
.
_instance
.
aclose
()
await
AsyncRedisClient
.
_instance
.
aclose
()
AsyncRedisClient
.
_instance
=
None
AsyncRedisClient
.
_instance
=
None
class
Tool
:
class
Tool
:
@staticmethod
@staticmethod
async
def
get_proxy_ips
()
->
set
:
async
def
get_proxy_ips
()
->
set
:
...
@@ -88,7 +93,7 @@ class Tool:
...
@@ -88,7 +93,7 @@ class Tool:
:param obj: 字典对象
:param obj: 字典对象
:return: 替换后的字典对象
:return: 替换后的字典对象
"""
"""
return
json
.
loads
(
json
.
dumps
(
obj
)
.
replace
(
'null'
,
'""'
))
return
json
.
loads
(
json
.
dumps
(
obj
)
.
replace
(
"null"
,
'""'
))
@staticmethod
@staticmethod
def
get_walmart_headers
():
def
get_walmart_headers
():
...
@@ -98,9 +103,9 @@ class Tool:
...
@@ -98,9 +103,9 @@ class Tool:
:return:
:return:
"""
"""
return
{
return
{
'accept'
:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
,
"user-agent"
:
UA
.
chrome
,
'referer'
:
'https://www.walmart.com/'
,
# "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
,
'user-agent'
:
UA
.
edge
,
# "referer": "https://www.walmart.com/"
,
# 'Cookie': 'xptwj=uz:37e0877dc804c3221718:bdVasz6/73NdttPtcK27fPToos/4oZxy1UVlykJE0nkfLqVhmrKHI452MWIshP06rvlg6Oo4i/CAN+vCMS/BDqQPJtb1jF2UpHbJHuf+N3jupylUCiRaFrAtkQyytLu9SHMeQtQzWLBWK0mgSKk5GRqrGp86eHJ+TbhU//sz5ypzHMVRNtnq; ak_bmsc=2773E11742238181A0BCDEF70472B0E8~000000000000000000000000000000~YAAQjPw7F8Q79RiRAQAAJI6FIRhEoerjT5Fn46CibnTHiZ8nQJv6pLxGhTxu+OWA2qwUlBGLclYLJGQXToJ+BTKfwzjwI5+ud07a9A2L57hKMVbIX2vAo4ZGXA9p0BWAKKn/SbV4VHTnZa/i+pIMZB0ylq0shc7noTTr/tsto5DDd/FHf5vFFICqtEujI+2AckaTNJGYW8PPy9VW6DqXCNpVcgo3qVSntqYOT1bhJLjdYtWmspJGBhEFH5vRQdC7IdN3VqC3BxJLZWzVTaQsmsJYN2Pem7MKiHyk8/AgjKv9ZRs57VOCn/YXSPICVuP9SNUGIJcxZG8Le0VS+L4XcgyY3ngnOx8XBn8MNzpceQ0rKVCI44zb1SD11n6ympD4JV12cwN8L1JFpPs=; vtc=UcbKnqHGuhRWhgfWtgcr70; bstc=UcbKnqHGuhRWhgfWtgcr70; pxcts=85e47474-52ff-11ef-8dba-4f0e258ac181; _pxvid=85e46481-52ff-11ef-8dba-e4bc4df1a16b; adblocked=true; hasACID=true; assortmentStoreId=3081; _shcc=US; _intlbu=false; hasLocData=1; abqme=true; mobileweb=0; xpth=x-o-mart%2BB2C~x-o-mverified%2Bfalse; xpa=54G-6|CoEEB|D2oRZ|ELwUx|IuElO|McEea|MoRkL|NbX17|O8sIU|OFImx|Ocfr2|SqH-y|VyWly|XIItK|eo_el|nzZmL|rd3k-|zf8aF; exp-ck=54G-61D2oRZ1ELwUx1IuElO1NbX172O8sIU1OFImx1SqH-y2VyWly1XIItK2eo_el3nzZmL1zf8aF2; _pxhd=c8185cb38f153869ee089d6ab969bf1ed0ba0f4d4e66cd414b5dfb0daa85c913:85e46481-52ff-11ef-8dba-e4bc4df1a16b; bm_mi=ABB90B88348B58A9787BACA8B2B84DC5~YAAQjPw7F7w99RiRAQAA7KGFIRgasMlRZloJg1t00D254khXjSN/IrLyrciUo2TiMd/5dzEdpQ0rZdLkmWbcqhDvW4LcpJsY0/ViOAItAsERoIpacm5TGpo4+dliNw8JD8aa2peQ5nWBF46y0YMbmPatpEzPfi+SasMjQmt+oWQMr2Q9I3p9CBFvXsmAazCwcGDmXNtTShQbyQ9Gfq93Zgc1eh3WXmhtsDw7hanPmZF2kqaqIL1bBE46OKpVQzJKpiBZVtluHYILY+4LsIaKmwxNJmW1gbqIDx8Sbm4anTBTryfr26L/s3IA5mQ3yyk=~1; _px3=27f094085ac92f8a53a7507dbb323f50efb95f173b554348e0b72a5732857d78:2VRKXx+P0wIrkvwIM7+Xtfysy6oYDVs6V9uhgK8m88W6Ck43XPZSkDLlnFReenWMPrQ3MmpViErhyjVaXANA0A==:1000:5ZwO3UHAT/3uI8KmWYckrGicT4zhb/RLBnKTB2fZu7NK2BVIs9Tp4YrQEPmeQLr27F/Csvs7uj4SuQMN8cPuZyDda7XwJIqyx7V/BlbxwhefKls21slpn9Hkiz0U44U2DITgh0p/sfol2JVGAEXwS66TjQY9DEa3M2GGuD2Xf4+3KT5MAymWIMYp1w5P3Rqtv0KcYxURCTMZDW2B3Ol9/sKFAOeEgEWRfvTh0NaYVLI=; _astc=52d130b133cbd1b501460d9fdae93a97; xptc=_m%2B9~assortmentStoreId%2B3081; xpm=1%2B1722844296%2BUcbKnqHGuhRWhgfWtgcr70~%2B0; _pxde=c92ceb7d7d808ccee6d120ac60cabdac3b14ba9f42bfb2ff4ed5e6ef8f8a7396:eyJ0aW1lc3RhbXAiOjE3MjI4NDQyOTkyMjN9; xptwg=2769187247:ADDF1B60AE2118:1B17408:73DC67D1:89F8600E:2CEC42A1:; TS012768cf=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS01a90220=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS2a5e0c5c027=08edc6f644ab20005250728372d83aee067d8ef4429ed38ad3f72422cd7beb712284fa2bb6dcc53008e648c99a113000bbea3d56aaf743f8797d1fd537dfebae66e076aea8557039e6abbfe3d370af617c79b48e084bdcd637ffb8a8b7b06568; QMReplaySample=true; io_id=2b1e23f1-a177-4e38-86da-561e276b6abf; TS016ef4c8=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS01f89308=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS8cb5a80e027=08bd2f8669ab2000009af9c8550cbc249ae938bfdca0492f8d384c6808c0e90a144e4024b84fbf37082893210e113000ad32e6b74f355e50fc204aad58e20722d6ed74efd203ad1c6a356b2d93d18f547cc29e00ec15f9e4c59e73bb2f5fc352; bm_sv=60519E529ABF4EFE97D2B63408DD5BD1~YAAQjPw7F35D9RiRAQAAANaFIRionbimzr8LgiM2GAxwy+I6Bu2U7faKmNM03jfRJ1ukw3hFQzT+obDLwlGwWa4HEiO9wHosev0vkl9j46QR9DoFq+6/MAGwpf9A8wuMswRgYpSFSZvyAm8uCG9mGPzhuuN5sOmxMflboFyOm2+5jFgcDmBA3WzZRPhRRy1M0xYfthXmO5D7IppDKw8+Zbzj7sG6Wdg5pUBb5XzzWaDNDswJnHdONYEd7O7hOGbyIw==~1'
# 'Cookie': 'xptwj=uz:37e0877dc804c3221718:bdVasz6/73NdttPtcK27fPToos/4oZxy1UVlykJE0nkfLqVhmrKHI452MWIshP06rvlg6Oo4i/CAN+vCMS/BDqQPJtb1jF2UpHbJHuf+N3jupylUCiRaFrAtkQyytLu9SHMeQtQzWLBWK0mgSKk5GRqrGp86eHJ+TbhU//sz5ypzHMVRNtnq; ak_bmsc=2773E11742238181A0BCDEF70472B0E8~000000000000000000000000000000~YAAQjPw7F8Q79RiRAQAAJI6FIRhEoerjT5Fn46CibnTHiZ8nQJv6pLxGhTxu+OWA2qwUlBGLclYLJGQXToJ+BTKfwzjwI5+ud07a9A2L57hKMVbIX2vAo4ZGXA9p0BWAKKn/SbV4VHTnZa/i+pIMZB0ylq0shc7noTTr/tsto5DDd/FHf5vFFICqtEujI+2AckaTNJGYW8PPy9VW6DqXCNpVcgo3qVSntqYOT1bhJLjdYtWmspJGBhEFH5vRQdC7IdN3VqC3BxJLZWzVTaQsmsJYN2Pem7MKiHyk8/AgjKv9ZRs57VOCn/YXSPICVuP9SNUGIJcxZG8Le0VS+L4XcgyY3ngnOx8XBn8MNzpceQ0rKVCI44zb1SD11n6ympD4JV12cwN8L1JFpPs=; vtc=UcbKnqHGuhRWhgfWtgcr70; bstc=UcbKnqHGuhRWhgfWtgcr70; pxcts=85e47474-52ff-11ef-8dba-4f0e258ac181; _pxvid=85e46481-52ff-11ef-8dba-e4bc4df1a16b; adblocked=true; hasACID=true; assortmentStoreId=3081; _shcc=US; _intlbu=false; hasLocData=1; abqme=true; mobileweb=0; xpth=x-o-mart%2BB2C~x-o-mverified%2Bfalse; xpa=54G-6|CoEEB|D2oRZ|ELwUx|IuElO|McEea|MoRkL|NbX17|O8sIU|OFImx|Ocfr2|SqH-y|VyWly|XIItK|eo_el|nzZmL|rd3k-|zf8aF; exp-ck=54G-61D2oRZ1ELwUx1IuElO1NbX172O8sIU1OFImx1SqH-y2VyWly1XIItK2eo_el3nzZmL1zf8aF2; _pxhd=c8185cb38f153869ee089d6ab969bf1ed0ba0f4d4e66cd414b5dfb0daa85c913:85e46481-52ff-11ef-8dba-e4bc4df1a16b; bm_mi=ABB90B88348B58A9787BACA8B2B84DC5~YAAQjPw7F7w99RiRAQAA7KGFIRgasMlRZloJg1t00D254khXjSN/IrLyrciUo2TiMd/5dzEdpQ0rZdLkmWbcqhDvW4LcpJsY0/ViOAItAsERoIpacm5TGpo4+dliNw8JD8aa2peQ5nWBF46y0YMbmPatpEzPfi+SasMjQmt+oWQMr2Q9I3p9CBFvXsmAazCwcGDmXNtTShQbyQ9Gfq93Zgc1eh3WXmhtsDw7hanPmZF2kqaqIL1bBE46OKpVQzJKpiBZVtluHYILY+4LsIaKmwxNJmW1gbqIDx8Sbm4anTBTryfr26L/s3IA5mQ3yyk=~1; _px3=27f094085ac92f8a53a7507dbb323f50efb95f173b554348e0b72a5732857d78:2VRKXx+P0wIrkvwIM7+Xtfysy6oYDVs6V9uhgK8m88W6Ck43XPZSkDLlnFReenWMPrQ3MmpViErhyjVaXANA0A==:1000:5ZwO3UHAT/3uI8KmWYckrGicT4zhb/RLBnKTB2fZu7NK2BVIs9Tp4YrQEPmeQLr27F/Csvs7uj4SuQMN8cPuZyDda7XwJIqyx7V/BlbxwhefKls21slpn9Hkiz0U44U2DITgh0p/sfol2JVGAEXwS66TjQY9DEa3M2GGuD2Xf4+3KT5MAymWIMYp1w5P3Rqtv0KcYxURCTMZDW2B3Ol9/sKFAOeEgEWRfvTh0NaYVLI=; _astc=52d130b133cbd1b501460d9fdae93a97; xptc=_m%2B9~assortmentStoreId%2B3081; xpm=1%2B1722844296%2BUcbKnqHGuhRWhgfWtgcr70~%2B0; _pxde=c92ceb7d7d808ccee6d120ac60cabdac3b14ba9f42bfb2ff4ed5e6ef8f8a7396:eyJ0aW1lc3RhbXAiOjE3MjI4NDQyOTkyMjN9; xptwg=2769187247:ADDF1B60AE2118:1B17408:73DC67D1:89F8600E:2CEC42A1:; TS012768cf=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS01a90220=01d032572a9131c004c984f1591f1050b2bc64767650396a370f20a1d0dcb0c458b394f0f12ffbd85b8ab44153a1cbf2c143166c54; TS2a5e0c5c027=08edc6f644ab20005250728372d83aee067d8ef4429ed38ad3f72422cd7beb712284fa2bb6dcc53008e648c99a113000bbea3d56aaf743f8797d1fd537dfebae66e076aea8557039e6abbfe3d370af617c79b48e084bdcd637ffb8a8b7b06568; QMReplaySample=true; io_id=2b1e23f1-a177-4e38-86da-561e276b6abf; TS016ef4c8=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS01f89308=018f75cfbcd4def242c1bbe08d5578972d0f66b599a484d002e1540db87e4ac90c4800be2ab90e9078fff48b5e8c5739eb3d440c3b; TS8cb5a80e027=08bd2f8669ab2000009af9c8550cbc249ae938bfdca0492f8d384c6808c0e90a144e4024b84fbf37082893210e113000ad32e6b74f355e50fc204aad58e20722d6ed74efd203ad1c6a356b2d93d18f547cc29e00ec15f9e4c59e73bb2f5fc352; bm_sv=60519E529ABF4EFE97D2B63408DD5BD1~YAAQjPw7F35D9RiRAQAAANaFIRionbimzr8LgiM2GAxwy+I6Bu2U7faKmNM03jfRJ1ukw3hFQzT+obDLwlGwWa4HEiO9wHosev0vkl9j46QR9DoFq+6/MAGwpf9A8wuMswRgYpSFSZvyAm8uCG9mGPzhuuN5sOmxMflboFyOm2+5jFgcDmBA3WzZRPhRRy1M0xYfthXmO5D7IppDKw8+Zbzj7sG6Wdg5pUBb5XzzWaDNDswJnHdONYEd7O7hOGbyIw==~1'
}
}
...
@@ -112,10 +117,23 @@ class Tool:
...
@@ -112,10 +117,23 @@ class Tool:
:return:
:return:
"""
"""
impersonates
=
[
impersonates
=
[
"edge99"
,
# Chrome
"edge101"
,
"chrome99"
,
# "safari15_3",
"chrome100"
,
# "safari15_5",
"chrome101"
,
"chrome104"
,
"chrome107"
,
"chrome110"
,
"chrome116"
,
"chrome119"
,
"chrome120"
,
"chrome123"
,
"chrome124"
,
"chrome131"
,
"chrome133a"
,
"chrome136"
,
"chrome99_android"
,
"chrome131_android"
,
]
]
return
random
.
choice
(
impersonates
)
return
random
.
choice
(
impersonates
)
...
@@ -129,9 +147,11 @@ class Tool:
...
@@ -129,9 +147,11 @@ class Tool:
"""
"""
# with open(f'./log/data.html', 'w') as f:
# with open(f'./log/data.html', 'w') as f:
# json.dump(text, f)
# json.dump(text, f)
soup
=
BeautifulSoup
(
text
,
'html.parser'
)
soup
=
BeautifulSoup
(
text
,
"html.parser"
)
# 找到包含JSON数据的script标签
# 找到包含JSON数据的script标签
script_tag
=
soup
.
find
(
'script'
,
{
'id'
:
'__NEXT_DATA__'
,
'type'
:
'application/json'
})
script_tag
=
soup
.
find
(
"script"
,
{
"id"
:
"__NEXT_DATA__"
,
"type"
:
"application/json"
}
)
# 找到评分人数
# 找到评分人数
# a_tag = soup.find('a', {"link-identifier": "reviewsLink"})
# a_tag = soup.find('a', {"link-identifier": "reviewsLink"})
# ratings = int(a_tag.string.rstrip(' ratings').replace(',', '')) if a_tag else 0
# ratings = int(a_tag.string.rstrip(' ratings').replace(',', '')) if a_tag else 0
...
@@ -141,6 +161,7 @@ class Tool:
...
@@ -141,6 +161,7 @@ class Tool:
# json_data["$ratings"] = ratings
# json_data["$ratings"] = ratings
return
json_data
return
json_data
@staticmethod
async
def
upload
(
s
:
AsyncSession
,
data
:
str
):
async
def
upload
(
s
:
AsyncSession
,
data
:
str
):
url
=
"https://walmart.meinuosha.com/index.php/index/index/SetIteminformationitemId?accessvalue=Walmart2025PY0307"
url
=
"https://walmart.meinuosha.com/index.php/index/index/SetIteminformationitemId?accessvalue=Walmart2025PY0307"
headers
=
{
"content-type"
:
"application/json"
}
headers
=
{
"content-type"
:
"application/json"
}
...
@@ -148,8 +169,7 @@ class Tool:
...
@@ -148,8 +169,7 @@ class Tool:
logger
.
info
(
resp
.
text
)
logger
.
info
(
resp
.
text
)
@staticmethod
@staticmethod
@retry
(
stop
=
stop_after_attempt
(
3
))
async
def
get_html
(
s
:
AsyncSession
,
url
,
proxy_ip
):
async
def
get_html
(
s
:
AsyncSession
,
url
,
header
,
proxies
):
"""
"""
获取HTML内容
获取HTML内容
...
@@ -159,23 +179,28 @@ class Tool:
...
@@ -159,23 +179,28 @@ class Tool:
:param proxies:
:param proxies:
:return:
:return:
"""
"""
content
=
""
try
:
s
.
headers
.
clear
()
s
.
headers
.
clear
()
s
.
cookies
.
clear
()
s
.
cookies
.
clear
()
s
.
proxies
.
clear
()
s
.
proxies
.
clear
()
proxies
=
{
"http"
:
f
"http://{proxy_ip}"
,
"https"
:
f
"http://{proxy_ip}"
}
walmart_headers
=
Tool
.
get_walmart_headers
()
walmart_headers
=
Tool
.
get_walmart_headers
()
# impersonate = Tool.get_impersonate()
try
:
# response = await s.get(url, proxies=proxies, headers=walmart_headers, timeout=10, data={}, impersonate=impersonate)
impersonate
=
Tool
.
get_impersonate
()
response
=
await
s
.
get
(
url
,
proxies
=
proxies
,
headers
=
walmart_headers
,
timeout
=
10
,
data
=
{})
response
=
await
s
.
get
(
content
=
response
.
text
url
,
# logger.debug(content)
proxies
=
proxies
,
except
Exception
as
e
:
headers
=
walmart_headers
,
logger
.
error
(
f
"获取HTML失败: {url} {e}"
)
timeout
=
10
,
return
content
impersonate
=
impersonate
,
)
return
response
.
text
except
(
Timeout
,
ConnectionError
):
raise
Exception
(
f
"连接失败: {proxy_ip}"
)
except
ProxyError
:
raise
Exception
(
f
"代理失败: {proxy_ip}"
)
@staticmethod
@staticmethod
@retry
(
stop
=
stop_after_attempt
(
3
))
@retry
(
stop
=
stop_after_attempt
(
3
)
,
wait
=
wait_random
(
1
,
2
),
reraise
=
True
)
async
def
get_tasks
(
s
:
AsyncSession
)
->
list
:
async
def
get_tasks
(
s
:
AsyncSession
)
->
list
:
"""获取任务
"""获取任务
...
@@ -189,10 +214,11 @@ class Tool:
...
@@ -189,10 +214,11 @@ class Tool:
raise
Exception
(
"获取任务失败"
)
raise
Exception
(
"获取任务失败"
)
logger
.
info
(
resp
.
text
)
logger
.
info
(
resp
.
text
)
data
:
dict
=
resp
.
json
()
data
:
dict
=
resp
.
json
()
if
data
[
"status"
]
.
lower
()
==
'ok'
:
if
data
[
"status"
]
.
lower
()
==
"ok"
:
return
[
it
[
"itemId"
]
for
it
in
data
.
get
(
"shop_items"
,
[{}])]
return
[
it
[
"itemId"
]
for
it
in
data
.
get
(
"shop_items"
,
[{}])]
return
[]
return
[]
class
Goods
:
class
Goods
:
task_name
=
"沃尔玛商品"
task_name
=
"沃尔玛商品"
...
@@ -205,57 +231,72 @@ class Goods:
...
@@ -205,57 +231,72 @@ class Goods:
"""
"""
data_dict
=
await
Tool
.
get_html_to_json
(
content
)
data_dict
=
await
Tool
.
get_html_to_json
(
content
)
try
:
try
:
data
:
dict
=
data_dict
[
'props'
][
'pageProps'
][
'initialData'
][
'data'
]
data
:
dict
=
data_dict
[
"props"
][
"pageProps"
][
"initialData"
][
"data"
]
# ratings = data_dict["$ratings"]
# ratings = data_dict["$ratings"]
except
:
# noqa: E722
except
:
# noqa: E722
raise
ProxyError
(
"获取数据异常"
)
raise
Exception
(
"获取数据异常"
)
product
:
dict
=
data
.
get
(
'product'
,
{})
product
:
dict
=
data
.
get
(
"product"
,
{})
if
product
is
None
:
if
product
is
None
:
return
return
idml
:
dict
=
data
.
get
(
"idml"
)
or
{}
idml
:
dict
=
data
.
get
(
"idml"
)
or
{}
reviews
:
dict
=
data
.
get
(
"reviews"
)
or
{}
reviews
:
dict
=
data
.
get
(
"reviews"
)
or
{}
item_id
=
product
.
get
(
"usItemId"
)
or
''
item_id
=
product
.
get
(
"usItemId"
)
or
""
if
IS_DEBUG
:
if
IS_DEBUG
:
# with open(f'./log/item_{item_id}.json', 'w') as f:
# with open(f'./log/item_{item_id}.json', 'w') as f:
# json.dump(data, f)
# json.dump(data, f)
pass
pass
category
=
[
category
=
[
(
it
.
get
(
"name"
)
or
''
)
.
replace
(
"&"
,
"&"
)
(
it
.
get
(
"name"
)
or
""
)
.
replace
(
"&"
,
"&"
)
for
it
in
((
product
.
get
(
"category"
)
or
{})
.
get
(
"path"
)
or
[{}])
for
it
in
((
product
.
get
(
"category"
)
or
{})
.
get
(
"path"
)
or
[{}])
]
]
crossed_price
=
((
product
.
get
(
"priceInfo"
)
or
{})
.
get
(
"wasPrice"
)
or
{})
.
get
(
"price"
)
or
''
crossed_price
=
((
product
.
get
(
"priceInfo"
)
or
{})
.
get
(
"wasPrice"
)
or
{})
.
get
(
main_image
=
((
product
.
get
(
"imageInfo"
)
or
{})
.
get
(
"allImages"
)
or
[{}])[
0
]
.
get
(
"url"
)
or
''
"price"
discount
=
(
product
.
get
(
"promoDiscount"
)
or
{})
.
get
(
"discount"
)
or
''
)
or
""
pro_seller
=
"Pro seller"
if
any
(
main_image
=
((
product
.
get
(
"imageInfo"
)
or
{})
.
get
(
"allImages"
)
or
[{}])[
0
]
.
get
(
"url"
)
or
""
discount
=
(
product
.
get
(
"promoDiscount"
)
or
{})
.
get
(
"discount"
)
or
""
pro_seller
=
(
"Pro seller"
if
any
(
it
.
get
(
"type"
)
==
"PRO_SELLER"
it
.
get
(
"type"
)
==
"PRO_SELLER"
for
it
in
product
.
get
(
"trustBadges"
)
or
[{}]
for
it
in
product
.
get
(
"trustBadges"
)
or
[{}]
)
else
''
)
manufacturer
=
next
(
else
""
)
manufacturer
=
(
next
(
filter
(
filter
(
lambda
x
:
x
.
get
(
"name"
)
==
"Manufacturer"
,
lambda
x
:
x
.
get
(
"name"
)
==
"Manufacturer"
,
idml
.
get
(
"specifications"
,
[{}])
idml
.
get
(
"specifications"
,
[{}]),
),
),
{})
.
get
(
"value"
)
or
''
{},
)
.
get
(
"value"
)
or
""
)
return
{
return
{
"itemId"
:
item_id
,
"itemId"
:
item_id
,
"product_name"
:
product
.
get
(
'name'
)
or
''
,
"product_name"
:
product
.
get
(
"name"
)
or
""
,
"image"
:
main_image
,
"image"
:
main_image
,
"customerRating"
:
reviews
.
get
(
"roundedAverageOverallRating"
)
or
''
,
"customerRating"
:
reviews
.
get
(
"roundedAverageOverallRating"
)
or
""
,
"ratingCount"
:
reviews
.
get
(
"totalReviewCount"
)
or
''
,
"ratingCount"
:
reviews
.
get
(
"totalReviewCount"
)
or
""
,
"price"
:
((
product
.
get
(
"priceInfo"
)
or
{})
.
get
(
"currentPrice"
)
or
{})
.
get
(
"price"
)
or
0
,
"price"
:
((
product
.
get
(
"priceInfo"
)
or
{})
.
get
(
"currentPrice"
)
or
{})
.
get
(
"price"
)
or
0
,
"Crossedprice"
:
crossed_price
,
"Crossedprice"
:
crossed_price
,
"Commodityclassification"
:
category
,
"Commodityclassification"
:
category
,
'brand_name'
:
product
.
get
(
'brand'
)
or
''
,
"brand_name"
:
product
.
get
(
"brand"
)
or
""
,
"Shoppingcartstore"
:
product
.
get
(
"sellerDisplayName"
)
or
''
,
"Shoppingcartstore"
:
product
.
get
(
"sellerDisplayName"
)
or
""
,
"manufacturer"
:
manufacturer
,
"manufacturer"
:
manufacturer
,
"Pro_seller"
:
pro_seller
,
"Pro_seller"
:
pro_seller
,
"discount"
:
discount
,
"discount"
:
discount
,
}
}
async
def
subtask
(
self
,
s
:
AsyncSession
,
task_ip
:
str
,
proxy_ip
:
bytes
):
async
def
subtask
(
self
,
s
:
AsyncSession
,
task_ip
:
str
,
proxy_ip
):
"""子任务
"""子任务
:param s:
:param s:
...
@@ -264,14 +305,12 @@ class Goods:
...
@@ -264,14 +305,12 @@ class Goods:
:return:
:return:
"""
"""
try
:
try
:
if
isinstance
(
proxy_ip
,
bytes
):
proxy_ip
=
proxy_ip
.
decode
()
proxy_ip
=
proxy_ip
.
decode
()
proxies
=
{
"http"
:
f
"http://{proxy_ip}"
,
"https"
:
f
"http://{proxy_ip}"
}
url
=
f
"https://www.walmart.com/ip/{task_ip}"
url
=
f
"https://www.walmart.com/ip/{task_ip}"
walmart_headers
=
Tool
.
get_walmart_headers
()
content
=
await
Tool
.
get_html
(
s
,
url
,
proxy_ip
)
content
=
await
Tool
.
get_html
(
s
,
url
,
walmart_headers
,
proxies
)
if
"Robot or human?"
in
content
:
raise
Exception
(
f
"人机验证: {proxy_ip}"
)
data
=
await
self
.
format_
(
content
)
data
=
await
self
.
format_
(
content
)
logger
.
debug
(
f
"{url} - {data}"
)
logger
.
debug
(
f
"{url} - {data}"
)
return
data
return
data
...
@@ -279,12 +318,11 @@ class Goods:
...
@@ -279,12 +318,11 @@ class Goods:
raise
Exception
(
task_ip
,
e
)
raise
Exception
(
task_ip
,
e
)
async
def
run_during_night
(
self
):
async
def
run_during_night
(
self
):
"""在晚上运行任务
"""在晚上运行任务"""
"""
tasks
=
set
()
tasks
=
set
()
while
True
:
while
True
:
now
=
datetime
.
now
()
.
time
()
now
=
datetime
.
now
()
.
time
()
if
(
time
(
22
,
0
)
>
now
and
now
>
time
(
7
,
0
))
and
not
IS_DEBUG
:
if
(
time
(
19
,
0
)
>
now
and
now
>
time
(
2
,
0
))
and
not
IS_DEBUG
:
await
AsyncRedisClient
()
.
close_redis
()
await
AsyncRedisClient
()
.
close_redis
()
logger
.
info
(
"不在指定时间范围内, 停止运行"
)
logger
.
info
(
"不在指定时间范围内, 停止运行"
)
break
break
...
@@ -324,7 +362,7 @@ class Goods:
...
@@ -324,7 +362,7 @@ class Goods:
for
task_ip
,
proxy_ip
in
zip
(
task_ips
,
proxy_ips
)
for
task_ip
,
proxy_ip
in
zip
(
task_ips
,
proxy_ips
)
]
]
fail
=
0
fail
=
0
some
=
min
(
len
(
task_ips
),
len
(
proxy_ips
)
)
some
=
len
(
fs
)
for
f
in
asyncio
.
as_completed
(
fs
):
for
f
in
asyncio
.
as_completed
(
fs
):
try
:
try
:
shop_item
=
await
f
shop_item
=
await
f
...
@@ -348,7 +386,7 @@ class Goods:
...
@@ -348,7 +386,7 @@ class Goods:
if
IS_DEBUG
:
if
IS_DEBUG
:
# json_data["shop_items"] = shop_items * 100
# json_data["shop_items"] = shop_items * 100
pass
pass
data
=
json
.
dumps
(
json_data
,
separators
=
(
','
,
':'
))
data
=
json
.
dumps
(
json_data
,
separators
=
(
","
,
":"
))
logger
.
debug
(
data
)
logger
.
debug
(
data
)
logger
.
info
(
f
"采集完成({len(shop_items)})"
)
logger
.
info
(
f
"采集完成({len(shop_items)})"
)
await
Tool
.
upload
(
s
,
data
)
await
Tool
.
upload
(
s
,
data
)
...
@@ -360,6 +398,7 @@ class Goods:
...
@@ -360,6 +398,7 @@ class Goods:
logger
.
error
(
f
"{self.task_name} - 任务异常 - {e}"
)
logger
.
error
(
f
"{self.task_name} - 任务异常 - {e}"
)
await
AsyncRedisClient
()
.
close_redis
()
await
AsyncRedisClient
()
.
close_redis
()
if
__name__
==
'__main__'
:
if
__name__
==
"__main__"
:
loop
=
asyncio
.
get_event_loop
()
loop
=
asyncio
.
get_event_loop
()
loop
.
run_until_complete
(
Goods
()
.
run_during_night
())
loop
.
run_until_complete
(
Goods
()
.
run_during_night
())
src/walmart_sup.py
View file @
48f21530
...
@@ -65,7 +65,7 @@ class Subsidiary:
...
@@ -65,7 +65,7 @@ class Subsidiary:
"""在晚上运行任务"""
"""在晚上运行任务"""
while
True
:
while
True
:
now
=
datetime
.
now
()
.
time
()
now
=
datetime
.
now
()
.
time
()
if
time
(
22
,
0
)
>
now
and
now
>
time
(
7
,
0
):
if
time
(
19
,
0
)
>
now
and
now
>
time
(
2
,
0
):
for
task
in
self
.
_tasks
:
for
task
in
self
.
_tasks
:
if
task
and
not
task
.
done
():
if
task
and
not
task
.
done
():
task
.
cancel
()
task
.
cancel
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment