Categorías
Programación

Web scraper with Scrapy

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import scrapy
import re

title_regex       = r'Letra de\s+([a-zA-Z0-9áéíóúñü_,!¡¿?"() ]+)\s-'
empty_lines_regex = r"^\s+$"
tabs_regex        = r"^[\n\t]+"

class ConchaPiquerSpider(scrapy.Spider):
    name = 'conchitabot'
    allowed_domain = ['http://www.coveralia.com']
    start_urls = ['http://www.coveralia.com/letras-de/concha-piquer.php']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }
    BASE_URL = 'http://www.coveralia.com'
    def parse(self, response):
        lyric_links = response.css(".lista_uno li a::attr(href)").extract()
        for link in lyric_links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_lyric)
        lyric_names_raw = response.css(".lista_uno li a::text").extract()


    def parse_lyric(self,response):
        raw_titles = response.css("h1").extract()
        for raw_title in raw_titles:
            match = re.search(title_regex, raw_title.encode("utf-8"))
            if match:
                title = match.group(1)
        raw_text = response.css("#HOTWordsTxt::text").extract()
        encoded_text = []
        single_string = ""
        for item_text in raw_text:
            single_string = single_string + item_text

        lyric = self.clean_lyric(single_string)

        text_file = open("./letras/" + title + ".txt", "w")
        text_file.write(lyric)
        text_file.close()

    def clean_lyric(self,dirty_str):
        encoded = dirty_str.encode("utf-8")
        no_spaces = re.sub(r"^\s+", '', encoded)
        no_tabs = re.sub(r"[\n\t]+", '', no_spaces)
        return no_tabs
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import re

def get_sorted_files(Directory):
    filenamelist = []
    for root, dirs, files in os.walk(Directory):
        for name in files:
            fullname = os.path.join(root, name)
            filenamelist.append(fullname)
    return sorted(filenamelist)

text = "<head><meta charset='utf-8'>"
folder = "./letras/"
files = get_sorted_files(folder)
for filename in files:
    filebase = re.sub(folder, "", filename)
    filebase = re.sub("\..*$", "", filebase)
    with open(filename,'r') as f:
        text = text + "<h1>" + filebase + "</h1><pre>" + f.read() + "</pre>"

unified = open("unified.html", "w")
unified.write(text)
unified.close()

Deja una respuesta

Tu dirección de correo electrónico no será publicada. Los campos obligatorios están marcados con *

Este sitio usa Akismet para reducir el spam. Aprende cómo se procesan los datos de tus comentarios.