#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import os
import re

import scrapy
# Captures the song title from text of the form "Letra de <title> -".
# The title may contain ASCII letters, digits, the listed accented Spanish
# letters, underscore, comma, the punctuation !¡¿?"() and spaces.
title_regex = r'Letra de\s+([a-zA-Z0-9áéíóúñü_,!¡¿?"() ]+)\s-'
# Matches text made up solely of whitespace.
# NOTE(review): not referenced in the visible part of this file.
empty_lines_regex = r"^\s+$"
# Matches a leading run of newlines and tabs.
# NOTE(review): not referenced in the visible part of this file.
tabs_regex = r"^[\n\t]+"
class ConchaPiquerSpider(scrapy.Spider):
    """Crawl the Concha Piquer lyric index on coveralia.com and save each lyric.

    ``parse`` follows every link in the ``.lista_uno`` list of the start
    page; ``parse_lyric`` writes the text of each lyric page to
    ``<OUTPUT_DIR>/<title>.txt`` encoded as UTF-8.
    """

    name = 'conchitabot'
    # Scrapy reads ``allowed_domains`` (plural) and expects bare host names,
    # not URLs; a differently spelled attribute is silently ignored.
    allowed_domains = ['coveralia.com']
    start_urls = ['http://www.coveralia.com/letras-de/concha-piquer.php']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }
    # Kept for code that may still read it; links are resolved with
    # ``response.urljoin`` instead of string concatenation.
    BASE_URL = 'http://www.coveralia.com'
    # Directory that receives one ``.txt`` file per lyric.
    OUTPUT_DIR = './letras/'

    def parse(self, response):
        """Yield one request per lyric link found on the index page."""
        for link in response.css(".lista_uno li a::attr(href)").extract():
            # urljoin handles both relative and absolute hrefs.
            yield scrapy.Request(response.urljoin(link),
                                 callback=self.parse_lyric)

    def parse_lyric(self, response):
        """Write the lyric found on ``response`` to a text file.

        The title comes from the page's ``<h1>`` elements via the
        module-level ``title_regex``. A page without a matching heading is
        skipped with a warning instead of failing on an unbound ``title``.
        """
        title = self._extract_title(response.css("h1").extract())
        if title is None:
            self.logger.warning("No lyric title found in %s", response.url)
            return

        raw_text = response.css("#HOTWordsTxt::text").extract()
        lyric = self.clean_lyric(u"".join(raw_text))

        if not os.path.isdir(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        path = os.path.join(self.OUTPUT_DIR, title + u".txt")
        # Encode exactly once, at the I/O boundary.
        with io.open(path, "w", encoding="utf-8") as text_file:
            text_file.write(lyric)

    @staticmethod
    def _extract_title(raw_titles):
        """Return the title captured by ``title_regex``, or ``None``.

        When several headings match, the last one wins.
        """
        pattern = title_regex
        if isinstance(pattern, bytes):
            # On Python 2 the literal is a UTF-8 byte string; decode it so
            # it can be matched against text.
            pattern = pattern.decode("utf-8")
        title = None
        for raw_title in raw_titles:
            match = re.search(pattern, raw_title)
            if match:
                title = match.group(1)
        return title

    def clean_lyric(self, dirty_str):
        """Strip leading whitespace, then remove every newline and tab.

        Operates on text rather than encoded bytes, so the string patterns
        passed to ``re.sub`` are valid on Python 3 as well.

        :param dirty_str: raw lyric text.
        :returns: the cleaned text.
        """
        no_spaces = re.sub(r"^\s+", u"", dirty_str)
        return re.sub(r"[\n\t]+", u"", no_spaces)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import re
def get_sorted_files(Directory):
    """Return the sorted full paths of every file below ``Directory``.

    The tree is walked recursively with ``os.walk``; each returned path is
    the walk root joined with the file name.
    """
    return sorted(
        os.path.join(parent, entry)
        for parent, _subdirs, entries in os.walk(Directory)
        for entry in entries
    )
# Build a single HTML page, ``unified.html``, holding every lyric file found
# under ``folder``: one <h1> heading and one <pre> body per file.
folder = u"./letras/"
files = get_sorted_files(folder)

# Collect the fragments and join once instead of growing a string in a loop.
parts = [u"<head><meta charset='utf-8'>"]
for filename in files:
    # Heading = path relative to ``folder`` without its extension. Path
    # functions are used so ``folder`` is never interpreted as a regex.
    filebase = os.path.splitext(os.path.relpath(filename, folder))[0]
    # The page declares UTF-8, so read and write with that encoding
    # explicitly rather than relying on the locale default.
    with io.open(filename, "r", encoding="utf-8") as f:
        # NOTE(review): heading and lyric text are inserted without HTML
        # escaping; "<" or "&" in a lyric would be parsed as markup.
        parts.append(u"<h1>" + filebase + u"</h1><pre>" + f.read() + u"</pre>")
text = u"".join(parts)

with io.open("unified.html", "w", encoding="utf-8") as unified:
    unified.write(text)