
在尝试从谷歌地图抓取商家信息时,一个常见的问题是无法可靠地获取所有商家的评论数量和平均星级。原始代码片段展示了使用Playwright进行抓取的过程,其中遍历了商家列表,点击每个商家以打开其详细信息面板,然后尝试从中提取数据。
尽管原始代码成功地获取了部分商家的评论数据(例如,8个商家中只获取了4个),但未能实现完全抓取。这通常源于以下几个核心问题:在动态内容尚未加载完成时就开始提取数据;定位器依赖原始列表的索引,而不是针对当前打开的详细信息面板;以及缺少针对单个商家的等待与错误处理。
鉴于谷歌地图的动态特性和Playwright在处理特定上下文定位时的潜在挑战,推荐使用Selenium WebDriver。Selenium通过模拟真实浏览器行为,结合其强大的等待机制和灵活的元素定位方法,可以更有效地处理这类动态网站。
以下是使用Selenium实现谷歌地图评论抓取的核心步骤和代码示例:
首先,确保您已安装Selenium和对应的浏览器驱动(例如ChromeDriver)。
pip install selenium
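核心逻辑流程:
1. 导航到谷歌地图搜索结果页;
2. 等待页面加载,并处理可能出现的Cookie同意弹窗;
3. 滚动结果列表以加载更多商家(如果需要);
4. 获取所有商家列表项;
5. 依次点击单个商家列表项;
6. 等待详细信息面板加载完成;
7. 从详细信息面板中提取名称、地址、网站、电话、类别以及评论数据;
8. 返回列表视图,继续处理下一个商家。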
示例代码结构:
import re
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Container for the details scraped for a single business.
class Business:
    """Plain record holding the fields collected for one business listing.

    Text fields start as the placeholder ``"N/A"`` and the two review
    fields start as ``None``; the scraper overwrites whichever values it
    manages to extract.
    """

    def __init__(self):
        # Text fields default to "N/A" so missing data is visible in output.
        self.name = "N/A"
        self.address = "N/A"
        self.website = "N/A"
        self.phone_number = "N/A"
        self.category = "N/A"
        # Review fields stay None until a rating label has been parsed.
        self.reviews_average = None
        self.reviews_count = None
# Rating aria-labels are expected to look like "4.5 stars 1,234 Reviews".
# The review-count part is optional and matching is case-insensitive, so a
# label that carries only the rating still yields the average.
REVIEWS_LABEL_RE = re.compile(
    r"(\d+(?:\.\d+)?)\s*stars?(?:\s+(\d[\d,]*)\s+reviews?)?",
    re.IGNORECASE,
)

# Locator of the button that closes the detail panel.
# NOTE(review): adjust to the actual page structure if it differs.
BACK_BUTTON_XPATH = '//button[@aria-label="Back"]'


def parse_reviews_label(label):
    """Extract the rating and review count from a rating aria-label.

    Args:
        label: Text such as ``"4.5 stars 1,234 Reviews"``; may be ``None``
            or empty.

    Returns:
        A ``(average, count)`` tuple. ``average`` is a ``float`` or ``None``
        when no rating is present; ``count`` is an ``int`` or ``None`` when
        the label carries no review count.
    """
    if not label:
        return None, None
    match = REVIEWS_LABEL_RE.search(label)
    if match is None:
        return None, None
    average = float(match.group(1))
    raw_count = match.group(2)
    # Thousands separators ("1,234") must go before int() can parse the count.
    count = int(raw_count.replace(",", "")) if raw_count else None
    return average, count


def _text_or_default(driver, xpath, default="N/A"):
    """Return the text of the first element matching *xpath*, else *default*."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except WebDriverException:
        # Missing or stale element: the field is simply not available.
        return default


def _click_back(driver, timeout):
    """Wait up to *timeout* seconds for the back button and click it.

    Raises:
        WebDriverException: if the button never becomes clickable
            (``TimeoutException``) or the click itself fails.
    """
    WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, BACK_BUTTON_XPATH))
    ).click()


def main():
    """Scrape name, contact details and review data for each search result.

    Opens a Google Maps search page, scrolls the result list, visits every
    listing's detail panel, collects the data into ``Business`` objects and
    prints a summary. The browser is always closed on exit.
    """
    # Configure the WebDriver. ChromeDriver is assumed to be on the system
    # PATH; otherwise pass an explicit path, e.g.
    # service = Service(executable_path='/path/to/chromedriver')
    driver = webdriver.Chrome()  # service=service
    try:
        # 1. Navigate to the Google Maps search results page (example URL).
        driver.get("https://www.google.com/maps/search/restaurants+near+me")

        # 2. Wait for the page and dismiss the cookie-consent popup if present
        #    (adjust the XPath to the actual page structure).
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(@aria-label, 'Accept all')]")
                )
            ).click()
            print("Accepted cookies.")
            time.sleep(2)  # give the page a moment to process the click
        except WebDriverException:
            print("No cookie consent popup found or already handled.")

        # 3. Scroll the result list to load more listings; the list is
        #    loaded incrementally, so scrolling has to be simulated.
        #    The container XPath below is an example.
        scrollable_div_xpath = '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[1]/div[1]'
        try:
            scrollable_div = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, scrollable_div_xpath))
            )
            for scroll_round in range(3):  # scroll to the bottom three times
                driver.execute_script(
                    "arguments[0].scrollTop = arguments[0].scrollHeight",
                    scrollable_div,
                )
                time.sleep(3)  # wait for new content to load
                print(f"Scrolled {scroll_round + 1} times.")
        except Exception as e:
            print(f"Could not find scrollable div or error during scroll: {e}")

        # 4. Collect all listing cards. The locator must be kept in sync
        #    with the real Google Maps HTML structure.
        listing_elements_xpath = '//div[@role="article" and @aria-label]'
        listings = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.XPATH, listing_elements_xpath))
        )
        print(f"Found {len(listings)} listings.")

        scraped_businesses = []
        labels_parsed = 0

        for index in range(len(listings)):
            try:
                print(f"\n--- Processing listing {index + 1} ---")

                # Re-locate the cards on every iteration: element handles
                # obtained before opening a detail panel may be stale once
                # the list view has been re-rendered.
                current_listings = driver.find_elements(By.XPATH, listing_elements_xpath)
                if index >= len(current_listings):
                    print(f"Listing {index + 1} is no longer present; stopping.")
                    break
                listing = current_listings[index]

                # 5. Click the listing once it is clickable.
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(listing)).click()
                print(f"Clicked listing {index + 1}.")

                # 6. Wait for a stable element of the detail panel (the
                #    business name) before reading anything.
                # NOTE(review): confirm this locator targets the detail panel
                # rather than a card in the result list.
                detail_name_xpath = '//div[contains(@class, "qBF1Pd fontHeadlineSmall ")]'
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.XPATH, detail_name_xpath))
                )
                print("Detail panel loaded.")

                business = Business()

                # 7. Extract data from the currently open detail panel; these
                #    locators do not depend on the listing's index.
                business.name = _text_or_default(driver, detail_name_xpath)
                # Address: data-item-id="address"
                business.address = _text_or_default(
                    driver,
                    '//button[@data-item-id="address"]//div[contains(@class, "fontBodyMedium")]',
                )
                # Website: data-item-id="authority"
                business.website = _text_or_default(
                    driver,
                    '//a[@data-item-id="authority"]//div[contains(@class, "fontBodyMedium")]',
                )
                # Phone: data-item-id contains "phone:tel:"
                business.phone_number = _text_or_default(
                    driver,
                    '//button[contains(@data-item-id, "phone:tel:")]//div[contains(@class, "fontBodyMedium")]',
                )
                # Category: example XPath, may need adjusting to the page.
                business.category = _text_or_default(
                    driver,
                    '//*[@id="QA0Szd"]/div/div/div[1]/div[3]/div/div[1]/div/div/div[2]/div[2]/div/div[1]/div[2]/div/div[2]/span/span/button',
                )

                # Review data: a span with role="img" whose aria-label holds
                # the star rating (and possibly the review count).
                reviews_span_xpath = '//span[@role="img" and contains(@aria-label, "stars")]'
                try:
                    reviews_element = WebDriverWait(driver, 5).until(
                        EC.visibility_of_element_located((By.XPATH, reviews_span_xpath))
                    )
                    reviews_label = reviews_element.get_attribute("aria-label")
                    print(f"Reviews Label: {reviews_label}")
                    average, count = parse_reviews_label(reviews_label)
                    business.reviews_average = average
                    business.reviews_count = count
                    if average is not None:
                        labels_parsed += 1
                except WebDriverException as e:
                    print(f"Could not find reviews for this listing or error processing label: {e}")
                    business.reviews_average = None
                    business.reviews_count = None

                scraped_businesses.append(business)

                # 8. Return to the list view via the back button.
                try:
                    _click_back(driver, 5)
                    print("Clicked back button.")
                    time.sleep(2)  # give the page time to show the list again
                except Exception as e:
                    # Without a back button the alternative is reloading the
                    # page, which is slower; prefer fixing the locator.
                    print(f"Could not click back button or no back button found: {e}")

            except Exception as e:
                print(f"Error processing listing {index + 1}: {e}")
                # Try to get back to the list so the next listing can still
                # be processed instead of staying stuck on the detail panel.
                try:
                    _click_back(driver, 3)
                    time.sleep(1)
                except WebDriverException as back_error:
                    print(f"Could not return to the list view: {back_error}")

        print(f"\nTotal businesses scraped: {len(scraped_businesses)}")
        print(f"Total reviews labels processed: {labels_parsed}")

        # Print the scraped results.
        for b in scraped_businesses:
            print(f"Name: {b.name}, Reviews Average: {b.reviews_average}, Reviews Count: {b.reviews_count}")
    finally:
        # Always close the browser, even after an error.
        driver.quit()
        print("Browser closed.")


if __name__ == "__main__":
    main()
# Optional configuration: build a Chrome Options object and pass the
# "--headless" argument through it.
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)
抓取谷歌地图这类动态网站需要对Web抓取原理和目标网站结构有深入理解。Playwright和Selenium都是强大的工具,但对于复杂交互和动态内容,Selenium凭借其成熟的WebDriver API和灵活的等待机制,往往能提供更稳定和易于调试的解决方案。通过采用智能等待、健壮的XPath策略和完善的错误处理,可以显著提高谷歌地图评论数据抓取的成功率和可靠性。请记住,网站结构可能随时变化,因此定期维护和更新您的抓取代码是必不可少的。
以上就是谷歌地图评论数据抓取:Playwright 问题解析与Selenium方案优化的详细内容,更多请关注php中文网其它相关文章!
谷歌浏览器Google Chrome是一款可让您更快速、轻松且安全地使用网络的浏览器。Google Chrome的设计超级简洁,使用起来得心应手。这里提供了谷歌浏览器纯净安装包,有需要的小伙伴快来保存下载体验吧!
Copyright 2014-2025 https://www.php.cn/ All Rights Reserved | php.cn | 湘ICP备2023035733号