Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
A
amazon-mult-site-sync
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
yexing
amazon-mult-site-sync
Commits
11e330a5
Commit
11e330a5
authored
Jun 14, 2025
by
yexing
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
u
parent
9fcc7b4f
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
82 additions
and
32 deletions
+82
-32
.gitignore
+1
-2
const.py
+17
-2
cookie_api.py
+0
-0
spider/base_info.py
+45
-23
tool.py
+19
-5
No files found.
.gitignore
View file @
11e330a5
__pycache__
.
vscode
.
*
celerybeat-*
.pytest_cache
log
pid
image
...
...
const.py
View file @
11e330a5
...
...
@@ -21,6 +21,18 @@ class Data:
@classmethod
def inverse_dict(cls):
    """Build the reverse lookup of ``cls.items()``.

    Returns:
        A dict mapping every value yielded by ``cls.items()`` back to its
        key. When several keys share a value, the pair yielded last wins.
    """
    return {
        value: name
        for name, value in cls.items()
    }
class PropProxy:
    """Lazy reference to a named attribute of another object.

    The attribute is looked up on every access instead of being captured at
    construction time, so later changes to ``ref.<prop>`` are reflected.
    """

    def __init__(self, ref: type, prop: str):
        """Remember the target without resolving it.

        Args:
            ref: Object (annotated as a class) that owns the attribute.
            prop: Name of the attribute to read from ``ref``.
        """
        self._ref = ref
        self._prop = prop

    @property
    def value(self):
        """Return the current value of ``ref.<prop>``.

        Raises:
            AttributeError: If ``ref`` has no attribute named ``prop``.
        """
        return getattr(self._ref, self._prop)

    def __str__(self):
        """Return the resolved value formatted with ``str``."""
        return str(self.value)

    def __repr__(self):
        """Return a debugging form that does not resolve the attribute."""
        return f"{type(self).__name__}({self._ref!r}, {self._prop!r})"
class
Spider
(
Data
):
...
...
@@ -66,4 +78,8 @@ class StockStatus(Data):
class SiteType(Data):
    # Numeric identifier for each site code (com, de, it, fr, es, jp).
    # Documented with comments rather than a docstring so the class
    # namespace that ``Data`` helpers may inspect stays unchanged.
    com = 1
    de = 2
    it = 3
    fr = 4
    es = 5
    jp = 6
cookie_api.py
View file @
11e330a5
This diff is collapsed.
Click to expand it.
spider/base_info.py
View file @
11e330a5
from
__future__
import
annotations
import
json
import
re
import
os
import
curl_cffi
from
loguru
import
logger
from
tenacity
import
retry
,
stop_after_attempt
,
wait_random
from
lxml
import
etree
from
const
import
Postcode
,
Site
from
const
import
Postcode
,
PropProxy
,
Site
from
db
import
RedisSingleton
from
proxy
import
ProxyManager
from
tool
import
Fmt
,
Request
,
Task
from
conf
import
config
# Environment variables are always strings, so "False" or "0" would be truthy
# if used as-is. Map the usual "off" spellings to False explicitly; an unset
# variable still yields False and any other non-empty value still counts as on.
IS_DEBUG = os.environ.get("IS_DEBUG", "").strip().lower() not in {
    "",
    "0",
    "false",
    "no",
    "off",
}

# Redis accessor built from the URL in the "redis" config section.
REDIS = RedisSingleton(redis_url=config["redis"]["url"])

# Settings read by the spider below (task/item keys, timeouts, flags).
task_info_config = config["task-info-detail"]
class
Tool
:
@staticmethod
def
get_amazon_sku
(
text
):
...
...
@@ -95,14 +98,14 @@ class ProxyMixin:
:return:
"""
#
if self.is_debug:
#
test_proxy = "127.0.0.1:7890"
# proxy = "#1#2#127.0.0.1:7890"
#
else:
proxy
=
self
.
proxy_manager
.
get_proxy
()
if
proxy
is
None
:
return
None
test_proxy
=
proxy
.
split
(
"#"
)[
0
]
if
self
.
is_debug
:
test_proxy
=
"127.0.0.1:7890"
proxy
=
"#1#2#127.0.0.1:7890"
else
:
proxy
=
self
.
proxy_manager
.
get_proxy
()
if
proxy
is
None
:
return
None
test_proxy
=
proxy
.
split
(
"#"
)[
0
]
return
{
"proxy"
:
f
"chensav:chensav@{test_proxy}"
,
"temp_proxy"
:
proxy
,
...
...
@@ -134,11 +137,11 @@ class ProxyMixin:
redis
.
incr
(
"amazon:cookie-error"
)
class
Info
(
ProxyMixin
):
class
Info
Spider
(
ProxyMixin
):
site
=
Site
.
com
postcode
=
P
ostcode
.
com
postcode
=
P
ropProxy
(
Postcode
,
site
)
task_manager
=
Task
(
REDIS
)
def
__init__
(
self
):
self
.
task_key
=
task_info_config
[
"task_key"
]
self
.
item_key
=
task_info_config
[
"item_key"
]
...
...
@@ -146,39 +149,58 @@ class Info(ProxyMixin):
self
.
enabled
=
task_info_config
[
"enabled"
]
==
"True"
self
.
request_timeout
=
int
(
task_info_config
[
"request_timeout"
])
self
.
is_debug
=
task_info_config
[
"is_debug"
]
==
"True"
def format_content(self, text):
    """Extract delivery and product-detail fields from a product page.

    Args:
        text: HTML source of the page.

    Returns:
        A dict with three keys:
        ``free_delivery`` - ``Fmt.parse_date`` applied to the first primary
        delivery message, or ``""`` when the page has none;
        ``product_dimensions`` / ``item_weight`` - the matching rows of the
        ``productDetails_detailBullets_sections1`` table, or ``""`` when the
        row is absent.
    """
    html = etree.HTML(text)

    delivery_texts = html.xpath(
        '//div[@id="mir-layout-DELIVERY_BLOCK-slot-PRIMARY_DELIVERY_MESSAGE_LARGE"]/span/span/text()'
    )
    ths: list[etree._Element] = html.xpath(
        '//*[@id="productDetails_detailBullets_sections1"]/tr/th'
    )
    tds: list[etree._Element] = html.xpath(
        '//*[@id="productDetails_detailBullets_sections1"]/tr/td'
    )

    # ``.text`` is None when a cell has no text before its first child
    # element; fall back to "" so one such cell does not abort the parse.
    detail = {
        (th.text or "").strip(): (td.text or "").strip()
        for th, td in zip(ths, tds)
    }

    free_delivery = Fmt.parse_date(delivery_texts[0]) if delivery_texts else ""

    return {
        "free_delivery": free_delivery,
        "product_dimensions": detail.get("Product Dimensions", ""),
        "item_weight": detail.get("Item Weight", ""),
    }
@retry
(
stop
=
stop_after_attempt
(
20
),
wait
=
wait_random
(
3
,
6
),
retry_error_cls
=
lambda
*
_
:
...
)
def
run
(
self
,
task
):
@retry
(
stop
=
stop_after_attempt
(
20
),
wait
=
wait_random
(
3
,
6
),
retry_error_callback
=
lambda
*
_
:
...
,
)
def
run
(
self
,
task
:
dict
):
url
=
task
.
get
(
"url"
,
""
)
asin
=
Tool
.
get_url_asin
(
url
)
url
=
f
"https://www.amazon.{self.site}/dp/"
+
asin
+
"?th=1&psc=1"
_proxy
=
self
.
get_proxy
()
# if IS_DEBUG:
# _proxy = {"proxy": None}
if
_proxy
is
None
:
raise
Exception
(
"没有代理"
)
raise
ValueError
(
"没有代理"
)
try
:
headers
=
self
.
task_manager
.
get_loca_cookie
(
site
=
self
.
site
)
headers
=
self
.
task_manager
.
get_loca_cookie
(
site
=
self
.
site
,
postcode
=
self
.
postcode
.
value
)
text
=
Request
.
request_html
(
url
,
_proxy
[
"proxy"
],
**
{
"headers"
:
headers
,
"timeout"
:
self
.
request_timeout
,
"postcode"
:
self
.
postcode
},
**
{
"headers"
:
headers
,
"timeout"
:
self
.
request_timeout
,
"postcode"
:
self
.
postcode
.
value
,
},
)
response
=
self
.
format_content
(
text
)
logger
.
debug
(
response
)
return
response
except
curl_cffi
.
curl
.
CurlError
:
logger
.
error
(
f
"请求超时: {url}"
)
raise
except
Exception
as
e
:
if
str
(
e
)
==
"出现验证码"
:
self
.
delete_proxy
(
_proxy
[
"temp_proxy"
])
...
...
@@ -186,6 +208,6 @@ class Info(ProxyMixin):
if
str
(
e
)
==
"采集邮编错误"
:
self
.
cookie_error
()
logger
.
error
(
f
"请求异常: {e} - {url}"
)
raise
finally
:
self
.
join_proxy
(
_proxy
[
"temp_proxy"
])
tool.py
View file @
11e330a5
...
...
@@ -14,8 +14,9 @@ from loguru import logger
from
lxml
import
etree
from
conf
import
config
from
const
import
Lang
,
StockStatus
from
const
import
Lang
,
S
ite
,
S
tockStatus
from
const
import
SiteType
from
db
import
RedisSingleton
DOMAIN
=
config
[
"app"
][
"domain"
]
COOKIE
=
config
[
"cookie"
]
...
...
@@ -23,7 +24,7 @@ IMPERSONATE = config["app"]["impersonate"]
class
Task
:
def __init__(self, redis_singleton: RedisSingleton):
    """Keep the Redis accessor used by the cookie helpers.

    Args:
        redis_singleton: Object whose ``get_connection()`` returns the
            Redis client used for reads and writes.
    """
    self.redis_singleton = redis_singleton
def
get_task
(
self
,
task_key
:
str
=
""
,
batch_size
:
int
=
10
):
...
...
@@ -120,19 +121,32 @@ class Task:
redis_client
.
delete
(
time_key
)
return
cookie
def get_loca_cookie(self, site: str = Site.com, postcode: str = None, only_local: bool = False):
    """Fetch the cookie cached in Redis for a site, optionally per postcode.

    Args:
        site: Site code; the Redis key is ``cookie:{site}``.
        postcode: When truthy, ``:{postcode}`` is appended to the key.
        only_local: When true, return whatever Redis holds (possibly
            ``None``) without falling back to ``get_cookie`` and without
            JSON-decoding it.

    Returns:
        With ``only_local`` set, the raw Redis value. Otherwise a cached
        value is JSON-decoded; on a cache miss the result of
        ``self.get_cookie(site)`` is returned as-is if it is a dict and
        JSON-decoded if it is not.
    """
    redis_client = self.redis_singleton.get_connection()

    key = f"cookie:{site}"
    if postcode:
        key += f":{postcode}"
    cookie = redis_client.get(key)

    if only_local:
        return cookie

    if not cookie:
        # Cache miss: the fallback is requested per site only; the postcode
        # is not passed on to ``get_cookie``.
        cookie = self.get_cookie(site)

    if isinstance(cookie, dict):
        return cookie
    return json.loads(cookie)
def set_loca_cookie(self, data: dict, site: str = Site.com, postcode: str = None):
    """Store ``data`` in Redis as JSON under the site's cookie key.

    Args:
        data: Cookie payload; serialised with ``json.dumps``.
        site: Site code; the Redis key is ``cookie:{site}``.
        postcode: When truthy, ``:{postcode}`` is appended to the key.
    """
    conn = self.redis_singleton.get_connection()
    cache_key = f"cookie:{site}:{postcode}" if postcode else f"cookie:{site}"
    conn.set(cache_key, json.dumps(data))
class
Request
:
...
...
@@ -189,7 +203,7 @@ class Request:
is_link_error
=
html
.
xpath
(
'//div[@id="g"]/a/@href'
)
title
=
Tool
.
get_title
(
html
)
if
len
(
is_link_error
)
==
0
and
len
(
title
)
==
0
and
is_product_detail
:
raise
Exception
(
f
"采集内容有误"
)
raise
Exception
(
"采集内容有误"
)
return
text
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment