McLaren is a sports car manufacturer. Here we will scrape the model names and PDF links from its website:
import re
import scrapy
from ..items import Manual
from scrapy.spiders import Spider
import urllib.parse
from scrapy.selector import Selector
class Cars_mclaren_comSpider(Spider):
    """Collect owner's-handbook PDF links from the McLaren ownership page.

    ``parse`` walks the handbook blocks of the start page, derives a model
    label from each block heading and issues one HEAD request per PDF link.
    ``check_pdf`` then emits an item for every PDF that answers with
    HTTP 200.
    """

    name = 'mclaren.com'
    start_urls = [
        'https://cars.mclaren.com/en/ownership/service-and-maintenance/owners-handbook']

    def parse(self, response, **cb_kwargs):
        """Yield a HEAD request for each PDF link inside a headed block.

        Args:
            response: The downloaded start page.
            **cb_kwargs: Accepted for callback compatibility; not used.

        Yields:
            scrapy.Request: HEAD request whose ``meta`` carries the absolute
            ``file_url`` and the cleaned model label under ``heading``.
        """
        # Query the block selectors directly instead of serialising each
        # block to HTML and re-parsing it.
        for block in response.xpath('//div[@class="column column-sm-2 column-md-6"]'):
            heading = block.xpath(
                './/h2[@class="heading-03 js-text-transition-element"]/text()'
            ).get()
            model = self.clean_model(heading.strip()) if heading else None
            if not model:
                # Missing or whitespace-only heading: there is no label to
                # attach, so the block's links are skipped.
                continue

            for pdf_link in block.xpath('.//a[contains(@href, ".pdf")]/@href').getall():
                file_url = response.urljoin(pdf_link)
                # HEAD is enough to learn the status without downloading
                # the PDF body.
                yield scrapy.Request(
                    url=file_url,
                    method="HEAD",
                    callback=self.check_pdf,
                    meta={'file_url': file_url, 'heading': model},
                    dont_filter=True,
                )

    def check_pdf(self, response):
        """Emit an item for a PDF link that responded with HTTP 200.

        Files whose decoded name contains ``Report`` are ignored. The first
        run of four digits in the file name, if any, is appended to the
        heading.

        Args:
            response: Response to the HEAD request issued by ``parse``.

        Yields:
            dict: ``{'heading': <label + 4-digit run>, 'pdf_links': <url>}``.
        """
        decoded_slug = urllib.parse.unquote(response.url.split('/')[-1])
        if 'Report' in decoded_slug:
            return

        file_url = response.meta['file_url']
        heading = response.meta['heading']

        # NOTE(review): with Scrapy's default HTTP-error handling, non-2xx
        # responses may never reach this callback - confirm whether this
        # warning is expected to fire for 4xx/5xx links.
        if response.status != 200:
            self.logger.warning(
                "Skipping: %s (Status: %s)", file_url, response.status)
            return

        year_match = re.search(r'\d{4}', decoded_slug)
        # Append the digits only when present, so the label never ends in
        # the literal text "None".
        year = year_match.group(0) if year_match else ''
        yield {
            'heading': f"{heading}{year}",
            'pdf_links': file_url,
        }

    def clean_model(self, heading):
        """Reduce a block heading to a short model label.

        Args:
            heading: Raw heading text.

        Returns:
            ``None`` for an empty heading; ``"Speedtail"`` when the heading
            mentions it; ``"MYxx <model> (20xx)"`` when an ``MYxx`` tag is
            present; ``"P1/P1GTR"`` when the heading contains that text;
            otherwise the cleaned model name, with ``/``-separated names
            joined by ``", "``.
        """
        if not heading:
            return None
        # "Speedtail" always wins, whatever else the heading contains.
        if "Speedtail" in heading:
            return "Speedtail"

        # Model-year tag in MYxx form, e.g. "MY22".
        year_match = re.search(r"MY(\d{2})", heading)

        model = re.sub(r"MY\d{2}\s*?\s*", "", heading).strip()
        model = re.sub(r"^,\s*", "", model)
        model = re.sub(r"\bMcLaren\b\s*", "", model).strip()
        model = re.sub(r"\b(Super Series|Ultimate Series|Sports Series)\b\s*-?\s*", "", model).strip()
        model = re.sub(r"SUPER SERIES - 750S", "", model)
        model = re.sub(r"^,\s*", "", model)

        parts = [part.strip() for part in model.split("/")]

        if year_match:
            # NOTE(review): only the first "/"-separated name is kept when a
            # model year is present - confirm that is intended.
            return f"{year_match.group(0)} {parts[0]} (20{year_match.group(1)})"
        if "P1/P1GTR" in heading:
            return "P1/P1GTR"
        return ", ".join(parts) if len(parts) > 1 else model
Post a Comment