yexing / amazon-mult-site-sync / Commits

Commit 11e330a5, authored Jun 14, 2025 by yexing
Commit message: u
Parent: 9fcc7b4f
Showing 5 changed files with 269 additions and 114 deletions (+269 -114):

    .gitignore           +1    -2
    const.py             +17   -2
    cookie_api.py        +187  -82
    spider/base_info.py  +45   -23
    tool.py              +19   -5
.gitignore (view file @ 11e330a5)

 __pycache__
-.vscode
+.*
 celerybeat-*
 .pytest_cache
 log
 pid
 image
 ...
const.py (view file @ 11e330a5)

 ...
@@ -21,6 +21,18 @@ class Data:
     @classmethod
     def inverse_dict(cls):
         return {v: k for k, v in cls.items()}


+class PropProxy:
+    def __init__(self, ref: type, prop: str):
+        self._ref = ref
+        self._prop = prop
+
+    @property
+    def value(self):
+        return getattr(self._ref, self._prop)
+
+    def __str__(self):
+        return str(self.value)
+
+
 class Spider(Data):
 ...

@@ -66,4 +78,8 @@ class StockStatus(Data):
 class SiteType(Data):
     com = 1
-    de = 2
\ No newline at end of file
+    de = 2
+    it = 3
+    fr = 4
+    es = 5
+    jp = 6
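The new PropProxy class is a small late-binding indirection: instead of copying a class attribute's value at definition time, it stores the owning class and the attribute name and resolves them on every read. A minimal sketch of the pattern, assuming Postcode is a plain class with per-site attributes (its real definition is not part of this diff, so the stand-in below is hypothetical):

    class Postcode:  # hypothetical stand-in; the real Postcode class is not shown in this commit
        com = "10001"

    postcode = PropProxy(Postcode, "com")
    print(postcode.value)  # -> "10001"

    # cookie_api.py's query_info endpoint mutates the class attribute per request...
    setattr(Postcode, "com", "20001")
    # ...and every proxy reading that attribute sees the new value on the next access:
    print(postcode.value)  # -> "20001"
    print(str(postcode))   # __str__ delegates to .value, so also "20001"

This is why spider/base_info.py below switches from postcode = Postcode.com, which froze the value at import time, to postcode = PropProxy(Postcode, site).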
cookie_api.py (view file @ 11e330a5)

 import asyncio
 import json
 import os
 import random
 import re

+from tenacity import retry, stop_after_attempt, wait_random
+import uvicorn
 from bs4 import BeautifulSoup
-from curl_cffi.requests import AsyncSession
+from curl_cffi.requests import Session
+from fastapi import FastAPI, HTTPException, Query
 from loguru import logger

 from const import Postcode, Site
 from db import RedisSingleton
+from spider.base_info import InfoSpider
+from tool import Task
 from conf import config

 # from utils.admin_api import callback_cookie

 IS_DEBUG = os.environ.get("IS_DEBUG", False)
 REDIS = RedisSingleton(redis_url=config["redis"]["url"])
+app = FastAPI()


-async def get_headers():
-    user_agent = await get_rand_ua()
+def get_headers():
+    user_agent = get_rand_ua()
     return {
-        'authority': 'www.amazon.com',
-        'accept': 'text/html,*/*',
-        'referer': 'https://www.amazon.com/',
-        'user-agent': user_agent,
+        "authority": "www.amazon.com",
+        "accept": "text/html,*/*",
+        "referer": "https://www.amazon.com/",
+        "user-agent": user_agent,
     }


-async def get_rand_ua():
+def get_rand_ua():
     """
     获取随机UA (get a random user agent)
     :return:
     """
     version_list = [
         # Full list of Chrome versions; the content is unchanged by this commit,
         # which only rewraps the list. Shown compacted here:
         "115.0.5790.171", "115.0.5790.110", "115.0.5790.102", "115.0.5790.99",
         "114.0.5735.199", "114.0.5735.134", "114.0.5735.110", "114.0.5735.91",
         "113.0.5672.127", "113.0.5672.93", "113.0.5672.64",
         "112.0.5615.138", "112.0.5615.121", "112.0.5615.87", "112.0.5615.50",
         "111.0.5563.147", "111.0.5563.111", "111.0.5563.65",
         "110.0.5481.178", "110.0.5481.104", "110.0.5481.100", "110.0.5481.97", "110.0.5481.78",
         "109.0.5414.120", "109.0.5414.75",
         "108.0.5359.99", "108.0.5359.95", "108.0.5359.72",
         "107.0.5304.107", "107.0.5304.88", "107.0.5304.63",
         "106.0.5249.119", "106.0.5249.103", "106.0.5249.91", "106.0.5249.62",
         "105.0.5195.127", "105.0.5195.102",
         "104.0.5112.102", "104.0.5112.81",
         "103.0.5060.134", "103.0.5060.114", "103.0.5060.66", "103.0.5060.53",
         "102.0.5005.115", "102.0.5005.63",
         "101.0.4951.67", "101.0.4951.64", "101.0.4951.54", "101.0.4951.41",
         "100.0.4896.127", "100.0.4896.88", "100.0.4896.75", "100.0.4896.60",
         "99.0.4844.84", "99.0.4844.82", "99.0.4844.74", "99.0.4844.51",
         "98.0.4758.102", "98.0.4758.82", "98.0.4758.80",
         "97.0.4692.99", "97.0.4692.71",
         "96.0.4664.110", "96.0.4664.93", "96.0.4664.45",
         "95.0.4638.69", "95.0.4638.54",
         "94.0.4606.81", "94.0.4606.71", "94.0.4606.61", "94.0.4606.54",
         "93.0.4577.82", "93.0.4577.63",
         "92.0.4515.159", "92.0.4515.131", "92.0.4515.107",
         "91.0.4472.164", "91.0.4472.124", "91.0.4472.114", "91.0.4472.106", "91.0.4472.101", "91.0.4472.77",
         "90.0.4430.212", "90.0.4430.93", "90.0.4430.85", "90.0.4430.72",
     ]
     windows_list = [
         "Windows NT 10.0; Win64; x64",
         "Windows NT 10.0; WOW64",
 ...

@@ -57,102 +139,125 @@ async def get_rand_ua():
     return f"Mozilla/5.0 ({window}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{version} Safari/537.36"


-async def get_proxies():
-    # proxy = "http://127.0.0.1:7890"
-    # return {
-    #     "https": proxy,
-    #     "http": proxy,
-    # }
-    return None
+def get_proxies():
+    proxy = "http://127.0.0.1:7890"
+    return {
+        "https": proxy,
+        "http": proxy,
+    }


+@retry(
+    stop=stop_after_attempt(20),
+    wait=wait_random(3, 6),
+    retry_error_callback=lambda *_: ...,  # return Ellipsis to the caller instead of raising RetryError
+)
+def run(zip_code):
+    inst = Task(REDIS)
+    if inst.get_loca_cookie(site=Site.com, postcode=zip_code, only_local=True):
+        return
-async def main(zip_code):
-    headers = await get_headers()
-    proxies = await get_proxies()
-    async with AsyncSession(max_clients=1) as s:
+    headers = get_headers()
+    proxies = get_proxies()
+    with Session() as s:
         url = "https://www.amazon.com"
-        response = await s.get(url, headers=headers, proxies=proxies)
-        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
-        captcha = html.find('input', id="captchacharacters")
+        response = s.get(url, headers=headers, proxies=proxies)
+        html = BeautifulSoup(response.text, "html.parser", from_encoding="utf-8")
+        captcha = html.find("input", id="captchacharacters")
         if captcha:
             raise Exception("出现验证码了")  # "a captcha appeared"
-        data_modal_action = html.find('span', id='nav-global-location-data-modal-action')
+        data_modal_action = html.find("span", id="nav-global-location-data-modal-action")
         if not data_modal_action:
             raise Exception("获取data_modal_action失败")  # "failed to get data_modal_action"
-        data_modal = data_modal_action.get('data-a-modal')
+        data_modal = data_modal_action.get("data-a-modal")
         if data_modal:
             data_modal = json.loads(data_modal)
-            csrf_token = data_modal.get('ajaxHeaders', {}).get('anti-csrftoken-a2z')
+            csrf_token = data_modal.get("ajaxHeaders", {}).get("anti-csrftoken-a2z")
             logger.info(f"获取csrf_token成功: {csrf_token}")  # "got csrf_token"
-            headers['anti-csrftoken-a2z'] = csrf_token
+            headers["anti-csrftoken-a2z"] = csrf_token
         url = "https://www.amazon.com/portal-migration/hz/glow/get-rendered-address-selections?deviceType=desktop&pageType=Gateway&storeContext=NoStoreName&actionSource=desktop-modal"
-        response = await s.request("GET", url, headers=headers, proxies=proxies)
-        csrf_token = re.findall('CSRF_TOKEN : \"([\s\S]*?)\"', response.text)
+        response = s.request("GET", url, headers=headers, proxies=proxies)
+        csrf_token = re.findall('CSRF_TOKEN : "([\s\S]*?)"', response.text)
         if len(csrf_token) == 0:
             raise Exception("获取csrf_token失败")  # "failed to get csrf_token"
-        headers['anti-csrftoken-a2z'] = csrf_token[0]
+        headers["anti-csrftoken-a2z"] = csrf_token[0]
         logger.info(f"获取csrf_token成功: {csrf_token[0]}")
         url = "https://www.amazon.com/portal-migration/hz/glow/address-change?actionSource=glow"
-        payload = json.dumps({"locationType": "LOCATION_INPUT", "zipCode": "20001", "deviceType": "web", "storeContext": "generic", "pageType": "Gateway", "actionSource": "glow"})
+        payload = json.dumps(
+            {
+                "locationType": "LOCATION_INPUT",
+                "zipCode": zip_code,  # was hard-coded to "20001"
+                "deviceType": "web",
+                "storeContext": "generic",
+                "pageType": "Gateway",
+                "actionSource": "glow",
+            }
+        )
         new_headers = {
-            'authority': 'www.amazon.com',
-            'accept': 'text/html,*/*',
-            'origin': 'https://www.amazon.com',
-            'referer': 'https://www.amazon.com/',
-            'x-requested-with': 'XMLHttpRequest',
-            'content-type': 'application/json'
+            "authority": "www.amazon.com",
+            "accept": "text/html,*/*",
+            "origin": "https://www.amazon.com",
+            "referer": "https://www.amazon.com/",
+            "x-requested-with": "XMLHttpRequest",
+            "content-type": "application/json",
         }
         headers.update(new_headers)
-        response = await s.request("POST", url, headers=headers, data=payload, proxies=proxies)
+        response = s.request("POST", url, headers=headers, data=payload, proxies=proxies)
         address_text = response.text
         logger.info(f"设置邮编返回值: {address_text}")  # "set-postcode response"
         if address_text:
             address_data = json.loads(address_text)
-            if address_data.get('address', {}).get('zipCode', '') != zip_code:
+            if address_data.get("address", {}).get("zipCode", "") != zip_code:
                 raise Exception("邮编验证失败")  # "postcode verification failed"
         url = "https://www.amazon.com"
-        response = await s.request("GET", url, headers=headers, proxies=proxies)
-        html = BeautifulSoup(response.text, 'html.parser', from_encoding='utf-8')
-        data = html.find('span', id="glow-ingress-line2").text
+        response = s.request("GET", url, headers=headers, proxies=proxies)
+        html = BeautifulSoup(response.text, "html.parser", from_encoding="utf-8")
+        data = html.find("span", id="glow-ingress-line2").text
         if zip_code not in data:
             raise Exception("邮编验证失败")
         cookies = s.cookies.items()
         # 拼接为字符串cookie (join the cookie pairs into one string)
-        cookie = ''
+        cookie = ""
         for name, value in cookies:
-            cookie += '{0}={1};'.format(name, value)
+            cookie += "{0}={1};".format(name, value)
         cookie = cookie[:-1]
+        data = {
+            "cookie": cookie,
+            "user_agent": headers["user-agent"],
+        }
+        inst.set_loca_cookie(data, site=Site.com, postcode=zip_code)
         result = {
-            "data": {
-                "cookie": cookie,
-                "user_agent": headers['user-agent'],
-            },
+            "data": data,
             "type": 1,
         }
         logger.success(f"设置邮编成功, {json.dumps(result)}")  # "postcode set successfully"
-        # callback_response = await callback_cookie(result)
+        # callback_response = callback_cookie(result)
         # logger.success(f"回调cookie: {json.dumps(callback_response)}")
-        await asyncio.sleep(3)


-if __name__ == '__main__':
-    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
-    zip_code = "20001"
-    while True:
-        try:
-            asyncio.run(main(zip_code))
-        except Exception as e:
-            logger.error(e)
+@app.get("/query/info")
+def query_info(
+    zip_code: str = Query(..., description="邮编"),  # "postcode"
+    url: str = Query(..., description="URL地址"),  # "URL"
+):
+    try:
+        setattr(Postcode, "com", zip_code)
+        run(zip_code)
+        return InfoSpider().run({"url": url})
+    except Exception as e:
+        logger.error(e)
+        raise HTTPException(status_code=500, detail="服务出错了")  # "service error"
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=9012)
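This commit turns the standalone cookie script into a small FastAPI service: GET /query/info first ensures a postcode-scoped Amazon cookie exists (run), then fetches the product page through InfoSpider. A hedged usage sketch, assuming the service is running locally on the port from the uvicorn.run call above and that the requests client library is available; the product URL below is a made-up example:

    import requests  # client-side dependency, not part of this repo

    resp = requests.get(
        "http://127.0.0.1:9012/query/info",
        params={
            "zip_code": "20001",                             # postcode the session cookie is bound to
            "url": "https://www.amazon.com/dp/B0EXAMPLE00",  # hypothetical product URL
        },
        timeout=180,  # run() may retry up to 20 times with 3-6 s waits on a cold cache
    )
    resp.raise_for_status()
    print(resp.json())  # whatever InfoSpider().run({"url": ...}) returned

Because run() short-circuits when a cookie for (Site.com, zip_code) is already cached, repeat queries for the same postcode skip the whole Amazon address-change handshake.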
spider/base_info.py (view file @ 11e330a5)

 from __future__ import annotations

 import json
 import re
 import os

 import curl_cffi
 from loguru import logger
 from tenacity import retry, stop_after_attempt, wait_random
 from lxml import etree

-from const import Postcode, Site
+from const import Postcode, PropProxy, Site
 from db import RedisSingleton
 from proxy import ProxyManager
 from tool import Fmt, Request, Task
 from conf import config

 IS_DEBUG = os.environ.get("IS_DEBUG", False)
 REDIS = RedisSingleton(redis_url=config["redis"]["url"])
 task_info_config = config["task-info-detail"]


 class Tool:
     @staticmethod
     def get_amazon_sku(text):
 ...

@@ -95,14 +98,14 @@ class ProxyMixin:
         :return:
         """
-        # if self.is_debug:
-        #     test_proxy = "127.0.0.1:7890"
-        #     proxy = "#1#2#127.0.0.1:7890"
-        # else:
-        proxy = self.proxy_manager.get_proxy()
-        if proxy is None:
-            return None
-        test_proxy = proxy.split("#")[0]
+        if self.is_debug:
+            test_proxy = "127.0.0.1:7890"
+            proxy = "#1#2#127.0.0.1:7890"
+        else:
+            proxy = self.proxy_manager.get_proxy()
+            if proxy is None:
+                return None
+            test_proxy = proxy.split("#")[0]
         return {
             "proxy": f"chensav:chensav@{test_proxy}",
             "temp_proxy": proxy,
 ...

@@ -134,11 +137,11 @@ class ProxyMixin:
         redis.incr("amazon:cookie-error")


-class Info(ProxyMixin):
+class InfoSpider(ProxyMixin):
     site = Site.com
-    postcode = Postcode.com
+    postcode = PropProxy(Postcode, site)
     task_manager = Task(REDIS)

     def __init__(self):
         self.task_key = task_info_config["task_key"]
         self.item_key = task_info_config["item_key"]
 ...

@@ -146,39 +149,58 @@ class Info(ProxyMixin):
         self.enabled = task_info_config["enabled"] == "True"
         self.request_timeout = int(task_info_config["request_timeout"])
         self.is_debug = task_info_config["is_debug"] == "True"

     def format_content(self, text):
         html = etree.HTML(text)
         free_delivery = html.xpath('//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()')
-        detail_bullets = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tbody/tr')
+        ths: list[etree._Element] = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr/th')
+        tds: list[etree._Element] = html.xpath('//*[@id="productDetails_detailBullets_sections1"]/tr/td')
+        detail = {th.text.strip(): td.text.strip() for th, td in zip(ths, tds)}
         free_delivery = Fmt.parse_date(free_delivery[0]) if len(free_delivery) else ""
         return {
             "free_delivery": free_delivery,
+            "product_dimensions": detail.get("Product Dimensions", ""),
+            "item_weight": detail.get("Item Weight", ""),
         }

-    @retry(stop=stop_after_attempt(20), wait=wait_random(3, 6), retry_error_cls=lambda *_: ...)
-    def run(self, task):
+    @retry(
+        stop=stop_after_attempt(20),
+        wait=wait_random(3, 6),
+        retry_error_callback=lambda *_: ...,
+    )
+    def run(self, task: dict):
         url = task.get("url", "")
         asin = Tool.get_url_asin(url)
         url = f"https://www.amazon.{self.site}/dp/" + asin + "?th=1&psc=1"
         _proxy = self.get_proxy()
         # if IS_DEBUG:
         #     _proxy = {"proxy": None}
         if _proxy is None:
-            raise Exception("没有代理")  # "no proxy available"
+            raise ValueError("没有代理")
         try:
-            headers = self.task_manager.get_loca_cookie(site=self.site)
+            headers = self.task_manager.get_loca_cookie(site=self.site, postcode=self.postcode.value)
             text = Request.request_html(
                 url,
                 _proxy["proxy"],
-                **{"headers": headers, "timeout": self.request_timeout, "postcode": self.postcode},
+                **{"headers": headers, "timeout": self.request_timeout, "postcode": self.postcode.value},
             )
             response = self.format_content(text)
             logger.debug(response)
             return response
         except curl_cffi.curl.CurlError:
             logger.error(f"请求超时: {url}")  # "request timed out"
             raise
         except Exception as e:
             if str(e) == "出现验证码":  # "a captcha appeared"
                 self.delete_proxy(_proxy["temp_proxy"])
 ...

@@ -186,6 +208,6 @@ class Info(ProxyMixin):
             if str(e) == "采集邮编错误":  # "postcode scrape error"
                 self.cookie_error()
             logger.error(f"请求异常: {e} - {url}")  # "request failed"
             raise
         finally:
             self.join_proxy(_proxy["temp_proxy"])
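The decorator change from retry_error_cls to retry_error_callback in both run() methods is a behavioral fix, not a rename. In tenacity, retry_error_cls names the exception class that is instantiated and raised once attempts are exhausted, so passing a lambda there would fail when tenacity tries to raise the lambda's return value; retry_error_callback, by contrast, is invoked with the final retry state, and its return value is handed back to the caller instead of a RetryError. A minimal sketch of the swallow-on-exhaustion behavior the code now relies on:

    from tenacity import retry, stop_after_attempt, wait_random

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_random(0, 1),
        retry_error_callback=lambda retry_state: None,  # swallow the failure, return None
    )
    def flaky():
        raise RuntimeError("always fails")

    print(flaky())  # after 3 attempts, prints None instead of raising tenacity.RetryError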
tool.py (view file @ 11e330a5)

 ...
@@ -14,8 +14,9 @@ from loguru import logger
 from lxml import etree

 from conf import config
-from const import Lang, StockStatus
+from const import Lang, Site, StockStatus
+from const import SiteType
 from db import RedisSingleton

 DOMAIN = config["app"]["domain"]
 COOKIE = config["cookie"]
 ...

@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"]
 class Task:
-    def __init__(self, redis_singleton):
+    def __init__(self, redis_singleton: RedisSingleton):
         self.redis_singleton = redis_singleton

     def get_task(self, task_key: str = "", batch_size: int = 10):
 ...

@@ -120,19 +121,32 @@ class Task:
         redis_client.delete(time_key)
         return cookie

-    def get_loca_cookie(self, site: str = "com"):
+    def get_loca_cookie(self, site: str = Site.com, postcode: str = None, only_local: bool = False):
         """
         获取本地cookie (get the locally cached cookie)
         :return:
         """
         redis_client = self.redis_singleton.get_connection()
-        cookie = redis_client.get(f"cookie:{site}")
+        key = f"cookie:{site}"
+        if postcode:
+            key += f":{postcode}"
+        cookie = redis_client.get(key)
+        if only_local:
+            return cookie
         if not cookie:
             cookie = self.get_cookie(site)
         if isinstance(cookie, dict):
             return cookie
         return json.loads(cookie)

+    def set_loca_cookie(self, data: dict, site: str = Site.com, postcode: str = None):
+        redis_client = self.redis_singleton.get_connection()
+        key = f"cookie:{site}"
+        if postcode:
+            key += f":{postcode}"
+        redis_client.set(key, json.dumps(data))


 class Request:
 ...

@@ -189,7 +203,7 @@ class Request:
         is_link_error = html.xpath('//div[@id="g"]/a/@href')
         title = Tool.get_title(html)
         if len(is_link_error) == 0 and len(title) == 0 and is_product_detail:
-            raise Exception(f"采集内容有误")
+            raise Exception("采集内容有误")  # "scraped content is invalid"; the old f-string had no placeholders
         return text
 ...
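Taken together, get_loca_cookie and set_loca_cookie change the Redis layout from one cached cookie per site (cookie:com) to one per site-postcode pair (cookie:com:20001), which is what lets cookie_api.py keep a separate Amazon session per zip code. A sketch of the resulting keys, assuming RedisSingleton wraps a redis-py style connection (the URL and values below are illustrative only):

    import json
    import redis  # assumption: redis-py, matching RedisSingleton's backend

    r = redis.Redis.from_url("redis://localhost:6379/0", decode_responses=True)  # hypothetical URL

    # Old layout: one cached cookie per site.
    #   cookie:com        -> {"cookie": ..., "user_agent": ...}
    # New layout: one cached cookie per (site, postcode) pair.
    #   cookie:com:20001  -> {"cookie": ..., "user_agent": ...}
    r.set("cookie:com:20001", json.dumps({
        "cookie": "session-id=...; session-token=...",  # string stitched together in cookie_api.run()
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...",
    }))

    print(json.loads(r.get("cookie:com:20001"))["user_agent"])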