Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
amazon-mult-site-sync
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
yexing
amazon-mult-site-sync
Commits
767321a7
Commit
767321a7
authored
Jun 13, 2025
by
yexing
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
u
parent
58d033ad
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
316 additions
and
1 deletions
+316
-1
cookie_api.py
+158
-0
spider/base_detail.py
+0
-1
spider/base_info.py
+158
-0
No files found.
cookie_api.py
0 → 100644
View file @
767321a7
import
asyncio
import
json
import
random
import
re
from
bs4
import
BeautifulSoup
from
curl_cffi.requests
import
AsyncSession
from
loguru
import
logger
# from utils.admin_api import callback_cookie
async def get_headers():
    """Build the default amazon.com request headers with a random User-Agent.

    :return: dict of HTTP headers ready to pass to the session.
    """
    ua = await get_rand_ua()
    headers = {
        'authority': 'www.amazon.com',
        'accept': 'text/html,*/*',
        'referer': 'https://www.amazon.com/',
        'user-agent': ua,
    }
    return headers
async def get_rand_ua():
    """Return a randomly assembled desktop-Chrome User-Agent string.

    A Chrome build number and a Windows platform token are drawn
    independently at random and substituted into the standard UA template.

    :return: User-Agent string.
    """
    chrome_builds = [
        "115.0.5790.171", "115.0.5790.110", "115.0.5790.102", "115.0.5790.99",
        "114.0.5735.199", "114.0.5735.134", "114.0.5735.110", "114.0.5735.91",
        "113.0.5672.127", "113.0.5672.93", "113.0.5672.64",
        "112.0.5615.138", "112.0.5615.121", "112.0.5615.87", "112.0.5615.50",
        "111.0.5563.147", "111.0.5563.111", "111.0.5563.65",
        "110.0.5481.178", "110.0.5481.104", "110.0.5481.100", "110.0.5481.97", "110.0.5481.78",
        "109.0.5414.120", "109.0.5414.75",
        "108.0.5359.99", "108.0.5359.95", "108.0.5359.72",
        "107.0.5304.107", "107.0.5304.88", "107.0.5304.63",
        "106.0.5249.119", "106.0.5249.103", "106.0.5249.91", "106.0.5249.62",
        "105.0.5195.127", "105.0.5195.102",
        "104.0.5112.102", "104.0.5112.81",
        "103.0.5060.134", "103.0.5060.114", "103.0.5060.66", "103.0.5060.53",
        "102.0.5005.115", "102.0.5005.63",
        "101.0.4951.67", "101.0.4951.64", "101.0.4951.54", "101.0.4951.41",
        "100.0.4896.127", "100.0.4896.88", "100.0.4896.75", "100.0.4896.60",
        "99.0.4844.84", "99.0.4844.82", "99.0.4844.74", "99.0.4844.51",
        "98.0.4758.102", "98.0.4758.82", "98.0.4758.80",
        "97.0.4692.99", "97.0.4692.71",
        "96.0.4664.110", "96.0.4664.93", "96.0.4664.45",
        "95.0.4638.69", "95.0.4638.54",
        "94.0.4606.81", "94.0.4606.71", "94.0.4606.61", "94.0.4606.54",
        "93.0.4577.82", "93.0.4577.63",
        "92.0.4515.159", "92.0.4515.131", "92.0.4515.107",
        "91.0.4472.164", "91.0.4472.124", "91.0.4472.114", "91.0.4472.106",
        "91.0.4472.101", "91.0.4472.77",
        "90.0.4430.212", "90.0.4430.93", "90.0.4430.85", "90.0.4430.72",
    ]
    platforms = [
        "Windows NT 10.0; Win64; x64",
        "Windows NT 10.0; WOW64",
        "Windows NT 6.3; Win64; x64",
        "Windows NT 6.3; WOW64",
        "Windows NT 6.2; Win64; x64",
        "Windows NT 6.2; WOW64",
    ]
    build = random.choice(chrome_builds)
    platform = random.choice(platforms)
    return (
        f"Mozilla/5.0 ({platform}) AppleWebKit/537.36 "
        f"(KHTML, like Gecko) Chrome/{build} Safari/537.36"
    )
async def get_proxies():
    """Return the proxy mapping for outgoing requests.

    Currently disabled: always returns None (direct connection).
    A local debugging proxy is kept below for reference.
    """
    # proxy = "http://127.0.0.1:7890"
    # return {"https": proxy, "http": proxy}
    return None
async def main(zip_code):
    """Visit amazon.com, set the delivery zip code, and harvest session cookies.

    Flow: fetch the home page (detecting captcha walls), extract the anti-CSRF
    token for the "glow" location widget, POST an address change with the
    requested zip code, verify the page banner now shows it, then assemble the
    session cookies and User-Agent into a callback payload.

    :param zip_code: US zip code to set, e.g. "20001".
    :raises Exception: on captcha, missing CSRF token, or failed zip verification.
    """
    headers = await get_headers()
    proxies = await get_proxies()
    async with AsyncSession(max_clients=1) as s:
        url = "https://www.amazon.com"
        response = await s.get(url, headers=headers, proxies=proxies)
        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
        captcha = html.find('input', id="captchacharacters")
        if captcha:
            raise Exception("出现验证码了")
        data_modal_action = html.find('span', id='nav-global-location-data-modal-action')
        if not data_modal_action:
            raise Exception("获取data_modal_action失败")
        data_modal = data_modal_action.get('data-a-modal')
        if data_modal:
            # The modal config embeds the anti-CSRF token required by the glow API.
            data_modal = json.loads(data_modal)
            csrf_token = data_modal.get('ajaxHeaders', {}).get('anti-csrftoken-a2z')
            logger.info(f"获取csrf_token成功: {csrf_token}")
            headers['anti-csrftoken-a2z'] = csrf_token
        url = "https://www.amazon.com/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
        response = await s.request("GET", url, headers=headers, proxies=proxies)
        # A second, fresher token is embedded in the rendered-selections response.
        csrf_token = re.findall(r'CSRF_TOKEN : \"([\s\S]*?)\"', response.text)
        if len(csrf_token) == 0:
            raise Exception("获取csrf_token失败")
        headers['anti-csrftoken-a2z'] = csrf_token[0]
        logger.info(f"获取csrf_token成功: {csrf_token[0]}")
        url = "https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow"
        # BUG FIX: the zip code was hard-coded to "20001" here, so calling
        # main() with any other zip code always failed the verification below.
        payload = json.dumps({
            "locationType": "LOCATION_INPUT",
            "zipCode": zip_code,
            "deviceType": "web",
            "storeContext": "generic",
            "pageType": "Gateway",
            "actionSource": "glow"
        })
        new_headers = {
            'authority': 'www.amazon.com',
            'accept': 'text/html,*/*',
            'origin': 'https://www.amazon.com',
            'referer': 'https://www.amazon.com/',
            'x-requested-with': 'XMLHttpRequest',
            'content-type': 'application/json'
        }
        headers.update(new_headers)
        response = await s.request("POST", url, headers=headers, data=payload, proxies=proxies)
        address_text = response.text
        logger.info(f"设置邮编返回值: {address_text}")
        if address_text:
            address_data = json.loads(address_text)
            if address_data.get('address', {}).get('zipCode', '') != zip_code:
                raise Exception("邮编验证失败")
        # Re-fetch the home page to confirm the location banner shows the zip.
        url = "https://www.amazon.com"
        response = await s.request("GET", url, headers=headers, proxies=proxies)
        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
        data = html.find('span', id="glow-ingress-line2").text
        if zip_code not in data:
            raise Exception("邮编验证失败")
        cookies = s.cookies.items()
        # Join cookies into a single "name=value;name=value" header string.
        cookie = ';'.join('{0}={1}'.format(name, value) for name, value in cookies)
        result = {
            "data": {
                "cookie": cookie,
                "user_agent": headers['user-agent'],
            },
            "type": 1,
        }
        logger.success(f"设置邮编成功, {json.dumps(result)}")
        # callback_response = await callback_cookie(result)
        # logger.success(f"回调cookie: {json.dumps(callback_response)}")
        await asyncio.sleep(3)
if __name__ == '__main__':
    import sys

    # BUG FIX: WindowsSelectorEventLoopPolicy exists only on Windows builds of
    # CPython; calling it unconditionally raises AttributeError on Linux/macOS.
    if sys.platform == "win32":
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    zip_code = "20001"
    # Run the cookie-refresh loop forever; each failure is logged and retried.
    while True:
        try:
            asyncio.run(main(zip_code))
        except Exception as e:
            # logger.exception keeps the traceback; logger.error(e) dropped it.
            logger.exception(e)
spider/base_detail.py
View file @
767321a7
...
...
@@ -19,7 +19,6 @@ from db import RedisSingleton
from
proxy
import
ProxyManager
from
tool
import
Fmt
,
Task
,
Request
redis_config
=
config
[
"redis"
]
task_monitoring_config
=
config
[
"task-product-detail"
]
redis_singleton
=
RedisSingleton
(
redis_url
=
config
[
"redis"
][
"url"
])
...
...
spider/base_info.py
0 → 100644
View file @
767321a7
import
json
import
re
from
const
import
Postcode
,
Site
from
db
import
RedisSingleton
from
proxy
import
ProxyManager
from
tool
import
Request
,
Task
from
conf
import
config
REDIS
=
RedisSingleton
(
redis_url
=
config
[
"redis"
][
"url"
])
task_monitoring_config
=
config
[
"task-product-detail"
]
class Tool:
    """Static helpers for parsing Amazon product pages and URLs."""

    @staticmethod
    def get_amazon_sku(text):
        """Extract the variation ("sku") mapping embedded in a product page.

        Looks for the ``dimensionValuesDisplayData`` JSON blob, falling back to
        ``dimensionValuesData``, and parses it when found.

        :param text: raw HTML/JS of the product page.
        :return: parsed dict on success, otherwise an empty list.
        """
        amazon_skus = re.findall(r"dimensionValuesDisplayData\" : ({[\s\S]*?}),", text)
        if len(amazon_skus) == 0:
            amazon_skus = re.findall(r"dimensionValuesData\": ({[\s\S]*?}),", text)
        if len(amazon_skus):
            amazon_skus = json.loads(amazon_skus[0])
        return amazon_skus

    @staticmethod
    def get_url_asin(url: str):
        """Pull the ASIN out of a product URL's ``/dp/`` segment.

        :param url: product URL.
        :return: ASIN string, or an empty result when no pattern matched.
        """
        # FIX: raw strings — the originals used non-raw "\?" / "\/" escapes,
        # which emit invalid-escape SyntaxWarnings on modern Python.
        patterns = [r"dp/(.+?)\?", r"dp/(.+?)/", r"dp/(.+)"]
        asin = ""
        for pattern in patterns:
            asin = re.findall(pattern, url)
            if len(asin):
                asin = asin[0]
                if len(asin) > 10:
                    # The greedy fallback can capture trailing path segments;
                    # keep only the first segment.
                    asin = asin.split("/")[0]
                break
        return asin

    @staticmethod
    def get_book_asin(url: str, asin: str):
        """Resolve a book's ASIN from a variant link.

        :param url: href of the variant link; ``javascript:void(0)`` marks the
            currently-selected variant, whose ASIN is already known.
        :param asin: fallback ASIN for the selected variant.
        :return: ASIN string, or "" when the link holds none.
        """
        if url == "javascript:void(0)":
            return asin
        asin = re.findall(r"/dp/(\w+)", url)
        if not asin:
            return ""
        return asin[0]

    @staticmethod
    def get_title(html):
        """Extract the product-title text nodes from a parsed page.

        :param html: element exposing ``.xpath()`` (presumably lxml — TODO confirm).
        :return: list of text nodes (possibly empty).
        """
        title = html.xpath('//span[@id="productTitle"]/text()')
        if len(title) == 0:
            title = html.xpath('//span[@id="bond-title-desktop"]/text()')
        return title

    @staticmethod
    def get_data_json(text):
        """Extract and parse the ``jQuery.parseJSON('...')`` payload from page JS.

        :param text: raw page text.
        :return: parsed JSON object.
        :raises IndexError: when no jQuery.parseJSON call is present.
        """
        data_json = re.findall(r"jQuery.parseJSON\('(.*)'\)", text)
        # Un-escape single quotes that were escaped inside the JS string literal.
        data_json = data_json[0].replace("\\'", "'")
        return json.loads(data_json)
class ProxyMixin:
    """Proxy-pool helpers shared by spider classes.

    Backed by the module-level ``REDIS`` singleton via ``ProxyManager``.
    """

    proxy_manager = ProxyManager(REDIS)

    def get_proxy(self):
        """Take one proxy from the pool.

        :return: dict with ``proxy`` (authenticated address) and ``temp_proxy``
            (the raw pool entry), or None when the pool is empty.
        """
        # Debug shortcut kept for reference:
        #   proxy = "#1#2#127.0.0.1:7890"
        raw = self.proxy_manager.get_proxy()
        if raw is None:
            return None
        # Pool entries are "#"-separated; the first field is host:port.
        host = raw.split("#")[0]
        return {
            "proxy": f"chensav:chensav@{host}",
            "temp_proxy": raw,
        }

    def join_proxy(self, proxy):
        """Return a proxy entry to the pool.

        :param proxy: raw pool entry previously taken via get_proxy().
        """
        return self.proxy_manager.join_proxy(proxy)

    def delete_proxy(self, proxy):
        """Remove a proxy from the pool.

        :param proxy: raw "#"-separated pool entry; its second field is the
            pool name used for deletion.
        """
        pool_name = proxy.split("#")[1]
        return self.proxy_manager.delete_proxy(pool_name)

    def cookie_error(self):
        """Bump the shared cookie-error counter in Redis."""
        conn = REDIS.get_connection()
        conn.incr("amazon:cookie-error")
class Info(ProxyMixin):
    """Base spider for Amazon product-info tasks on the .com site."""

    # Target marketplace and its delivery postcode (from the project `const` module).
    site = Site.com
    postcode = Postcode.com
    # Shared task/cookie manager backed by the module-level Redis singleton.
    task_manager = Task(REDIS)

    def __init__(self):
        """Load runtime settings from the task-product-detail config section."""
        self.task_key = task_monitoring_config["task_key"]
        self.item_key = task_monitoring_config["item_key"]
        self.task_number = int(task_monitoring_config["task_number"])
        # Config values arrive as strings; compare against "True" to get booleans.
        self.enabled = task_monitoring_config["enabled"] == "True"
        self.request_timeout = int(task_monitoring_config["request_timeout"])
        self.is_debug = task_monitoring_config["is_debug"] == "True"

    def run(self, task):
        """Fetch one product detail page for the given task.

        :param task: dict with at least a "url" key — TODO confirm schema.
        """
        url = task.get("url", "")
        asin = Tool.get_url_asin(url)
        # Rebuild a canonical detail URL from the extracted ASIN.
        url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
        # NOTE(review): get_proxy() can return None (empty pool); subscripting
        # below would then raise TypeError — confirm callers guarantee a proxy.
        _proxy = self.get_proxy()
        headers = self.task_manager.get_loca_cookie(site=self.site)
        Request.request_html(
            url,
            _proxy["proxy"],
            **{
                "headers": headers,
                "timeout": self.request_timeout,
                "postcode": self.postcode,
            },
        )
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment