Scraping code for mclaren.com

McLaren's website showcases its sports cars. Here we will scrape the model names and the PDF links of the owner's handbooks:

import re
import urllib.parse

import scrapy
from scrapy.selector import Selector
from scrapy.spiders import Spider

from ..items import Manual


class Cars_mclaren_comSpider(Spider):
    """Scrape owner's-handbook PDF links and model names from cars.mclaren.com.

    ``parse`` walks the handbook listing page and issues one HEAD request per
    PDF link it finds; ``check_pdf`` emits an item for every PDF that answers
    with HTTP 200. ``clean_model`` normalises the block heading into the
    model name attached to each item.
    """

    name = 'mclaren.com'
    start_urls = [
        'https://cars.mclaren.com/en/ownership/service-and-maintenance/owners-handbook',
    ]

    def parse(self, response, **cb_kwargs):
        """Yield a HEAD request for every PDF link found under a model heading.

        Args:
            response: The listing-page response.
            **cb_kwargs: Accepted for callback compatibility; not used.

        Yields:
            scrapy.Request: One HEAD request per PDF link, carrying the
            absolute ``file_url`` and the cleaned ``heading`` in ``meta``.
        """
        blocks = response.xpath('//div[@class="column column-sm-2 column-md-6"]')

        for block in blocks:
            heading = block.xpath(
                './/h2[@class="heading-03 js-text-transition-element"]/text()'
            ).get()
            if not heading:
                # No heading text in this block: there is no model name to
                # attach to its PDFs, so skip it instead of failing later.
                continue

            model = self.clean_model(heading.strip())
            if model is None:
                # The heading consisted of whitespace only.
                continue

            pdf_links = block.xpath('.//a[contains(@href, ".pdf")]/@href').getall()
            for pdf_link in pdf_links:
                file_url = response.urljoin(pdf_link)
                # HEAD is enough to check that the PDF exists without
                # downloading it. dont_filter lets the same URL be requested
                # again when it is listed under more than one heading.
                yield scrapy.Request(
                    url=file_url,
                    method="HEAD",
                    callback=self.check_pdf,
                    meta={'file_url': file_url, 'heading': model},
                    dont_filter=True,
                )

    def check_pdf(self, response):
        """Emit an item for a PDF whose HEAD request returned status 200.

        PDFs whose (URL-decoded) file name contains ``"Report"`` are ignored.
        The first run of four digits in the file name, if any, is appended to
        the heading; when there is none the heading is emitted unchanged.

        Args:
            response: Response to the HEAD request issued by ``parse``; its
                ``meta`` must contain ``file_url`` and ``heading``.

        Yields:
            dict: ``{'heading': <heading + digits>, 'pdf_links': <file_url>}``.
        """
        slug = urllib.parse.unquote(response.url.split('/')[-1])
        if 'Report' in slug:
            return

        file_url = response.meta['file_url']
        heading = response.meta['heading']

        if response.status != 200:
            # NOTE(review): with Scrapy's default HttpError handling, non-2xx
            # responses are presumably dropped before they reach this
            # callback, so this branch may rarely run - confirm settings.
            self.logger.warning(
                "Skipping: %s (Status: %s)", file_url, response.status
            )
            return

        # An empty suffix (rather than None) keeps a literal "None" out of
        # the emitted heading when the file name carries no four-digit run.
        # NOTE(review): the digits are appended without a separator, as in
        # the original output format - confirm that is intended.
        year_match = re.search(r'\d{4}', slug)
        year = year_match.group(0) if year_match else ''

        yield {
            'heading': f"{heading}{year}",
            'pdf_links': file_url,
        }

    def clean_model(self, heading):
        """Normalise a handbook block heading into a model name.

        Rules, in order:

        * A falsy heading yields ``None``.
        * Any heading containing ``"Speedtail"`` yields ``"Speedtail"``.
        * ``MYxx`` markers, the word ``McLaren``, the series prefixes
          ("Super Series", "Ultimate Series", "Sports Series") and the literal
          ``"SUPER SERIES - 750S"`` are removed.
        * With a model year present the result is ``"MYxx <name> (20xx)"``,
          e.g. ``"MY23 Artura"`` -> ``"MY23 Artura (2023)"``.
        * Without a model year, a heading containing ``"P1/P1GTR"`` yields
          ``"P1/P1GTR"``; otherwise the "/"-separated names are trimmed and
          joined with ``", "``.

        Args:
            heading: Raw heading text.

        Returns:
            str | None: The cleaned model name, or ``None`` for a falsy input.
        """
        if not heading:
            return None

        # Checked against the raw heading so later clean-up cannot lose it.
        if "Speedtail" in heading:
            return "Speedtail"

        year_match = re.search(r"MY(\d{2})", heading)

        model = re.sub(r"MY\d{2}\s*", "", heading).strip()
        model = re.sub(r"^,\s*", "", model)
        model = re.sub(r"\bMcLaren\b\s*", "", model).strip()
        model = re.sub(
            r"\b(Super Series|Ultimate Series|Sports Series)\b\s*-?\s*",
            "",
            model,
        ).strip()
        model = re.sub(r"SUPER SERIES - 750S", "", model)
        model = re.sub(r"^,\s*", "", model)

        parts = [part.strip() for part in model.split("/")]

        if year_match:
            # NOTE(review): only the first "/"-separated name is used when a
            # model year is present; any further names in the heading are
            # dropped - confirm whether they should be emitted as well.
            model_year = year_match.group(0)
            full_year = f"20{year_match.group(1)}"
            return f"{model_year} {parts[0]} ({full_year})"

        if "P1/P1GTR" in heading:
            return "P1/P1GTR"

        return ", ".join(parts)

No comments

Powered by Blogger.