Home > Unlabelled > Scraping code of mclaren.com

Scraping code of mclaren.com

McLaren's website (mclaren.com) showcases its sports cars. Here we will scrape the model names and the PDF links of the owner's handbooks:

import re

import scrapy
from ..items import Manual
from scrapy.spiders import Spider
import urllib.parse
from scrapy.selector import Selector


class Cars_mclaren_comSpider(Spider):
    """Collect owner's-handbook PDF links from cars.mclaren.com.

    ``parse`` walks the handbook page block by block, pairs each block's
    heading (normalised by ``clean_model``) with the PDF links found in the
    same block and issues one HEAD request per link.  ``check_pdf`` then
    emits one item per link that answered with HTTP 200.
    """

    name = 'mclaren.com'
    start_urls = [
        'https://cars.mclaren.com/en/ownership/service-and-maintenance/owners-handbook']

    # PDFs whose (URL-decoded) file name contains any of these markers are
    # dropped by ``check_pdf``.
    _SKIPPED_SLUG_MARKERS = ('Report',)

    def parse(self, response, **cb_kwargs):
        """Yield a HEAD request for every PDF link that sits under a heading.

        Blocks without an ``<h2>`` heading, or whose heading text is only
        whitespace, are skipped.  The cleaned heading and the absolute PDF
        URL travel to ``check_pdf`` through ``meta``.
        """
        # Iterate the nested selectors directly instead of serialising each
        # block to HTML and re-parsing it.
        blocks = response.xpath('//div[@class="column column-sm-2 column-md-6"]')

        for block in blocks:
            heading = block.xpath(
                './/h2[@class="heading-03 js-text-transition-element"]/text()').get()
            if not heading:
                continue

            # Test emptiness *after* stripping: a whitespace-only heading
            # would otherwise make clean_model() return None and end up as
            # the literal text "None" in the emitted item.
            heading = heading.strip()
            if not heading:
                continue

            model = self.clean_model(heading)
            pdf_links = block.xpath('.//a[contains(@href, ".pdf")]/@href').getall()

            for pdf_link in pdf_links:
                file_url = response.urljoin(pdf_link)

                # HEAD is enough to learn the status without downloading the
                # file; dont_filter keeps links shared by several blocks.
                yield scrapy.Request(
                    url=file_url,
                    method="HEAD",
                    callback=self.check_pdf,
                    meta={'file_url': file_url, 'heading': model},
                    dont_filter=True
                )

    def check_pdf(self, response):
        """Yield an item for a PDF whose HEAD request returned status 200.

        The item's ``heading`` is the heading passed in ``meta`` followed by
        the first 4-digit run found in the URL-decoded file name, if any.
        Files whose name contains one of ``_SKIPPED_SLUG_MARKERS`` are
        ignored.
        """
        slug = urllib.parse.unquote(response.url.split('/')[-1])
        if any(marker in slug for marker in self._SKIPPED_SLUG_MARKERS):
            return

        file_url = response.meta['file_url']
        heading = response.meta['heading']

        if response.status != 200:
            # NOTE(review): Scrapy's default HttpErrorMiddleware normally
            # filters non-2xx responses before they reach a callback, so this
            # branch may only run when handle_httpstatus_* is configured -
            # confirm against the project settings.
            self.logger.warning(
                "Skipping: %s (Status: %s)", file_url, response.status)
            return

        year_match = re.search(r'\d{4}', slug)
        # Fall back to an empty suffix: formatting a missing match directly
        # used to append the literal text "None" to the heading.
        year = year_match.group(0) if year_match else ''

        yield {
            'heading': f"{heading}{year}",
            'pdf_links': file_url
        }

    def clean_model(self, heading):
        """Normalise a handbook heading into a model label.

        Returns ``None`` for an empty heading, otherwise a string:

        * ``"Speedtail"`` whenever the heading mentions Speedtail;
        * ``"MYxx <model> (20xx)"`` when the heading carries an ``MYxx``
          model-year tag, where ``<model>`` is the first "/"-separated part
          of the cleaned heading;
        * ``"P1/P1GTR"`` when the heading contains that text and no year;
        * otherwise the cleaned heading, with "/"-separated parts joined by
          ``", "``.
        """
        if not heading:
            return None

        # Checked first: the result does not depend on any of the cleaning
        # below.
        if "Speedtail" in heading:
            return "Speedtail"

        # Model year tag such as "MY21".
        year_match = re.search(r"MY(\d{2})", heading)

        # Strip the year tag, the brand name and the series prefixes.
        model = re.sub(r"MY\d{2}\s*", "", heading).strip()
        model = re.sub(r"^,\s*", "", model)
        model = re.sub(r"\bMcLaren\b\s*", "", model).strip()
        model = re.sub(r"\b(Super Series|Ultimate Series|Sports Series)\b\s*-?\s*", "", model).strip()
        model = re.sub(r"SUPER SERIES -  750S", "", model)
        model = re.sub(r"^,\s*", "", model)

        parts = [part.strip() for part in model.split("/")]

        if year_match:
            # NOTE(review): only the first "/"-separated model is kept when a
            # year is present (e.g. "A/B" yields just "A"); this preserves
            # the existing behaviour - confirm whether the remaining models
            # should be reported as well.
            return f"{year_match.group(0)} {parts[0]} (20{year_match.group(1)})"

        if "P1/P1GTR" in heading:
            return "P1/P1GTR"

        return ", ".join(parts) if len(parts) > 1 else model

No comments

Subscribe to: Post Comments ( Atom )