- ver 1.2.47 :

- corretto errore di modifica scheda
- aggiunto scraping (fase 1)
This commit is contained in:
Surya Paolo
2025-05-16 10:26:55 +02:00
parent 1da0e0f4b5
commit 7e50299854
9 changed files with 194 additions and 38 deletions

View File

@@ -0,0 +1,125 @@
import axios from 'axios';
import cheerio from 'cheerio';
/**
 * Scrapes basic bibliographic data for a book from its Amazon.it product page.
 *
 * The page is addressed as `<baseUrl><isbn>`, downloaded with axios and parsed
 * with cheerio.
 */
class AmazonBookScraper {
  constructor() {
    this.baseUrl = 'https://www.amazon.it/dp/';
    // Abort requests that hang, so a single stalled page cannot block a whole batch.
    this.requestTimeoutMs = 15000;
  }

  /**
   * Downloads the product page for one ISBN.
   *
   * @param {string} isbn - Identifier appended to `baseUrl`.
   * @returns {Promise<*>} The response body (page HTML), or `null` when the
   *   request fails; the failure is logged, never thrown.
   */
  async fetchPage(isbn) {
    // Encode the identifier so stray characters (spaces, "/", "?") cannot alter the URL path.
    const url = `${this.baseUrl}${encodeURIComponent(isbn)}`;
    try {
      const { data } = await axios.get(url, {
        timeout: this.requestTimeoutMs,
        headers: {
          // Browser-like User-Agent sent with every request.
          'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' +
            'AppleWebKit/537.36 (KHTML, like Gecko) ' +
            'Chrome/113.0.0.0 Safari/537.36',
          // add further headers here if needed
        },
      });
      return data;
    } catch (err) {
      console.error(`Errore fetching ISBN ${isbn}:`, err.message);
      return null;
    }
  }

  /**
   * Extracts title, subtitle, page count, format and edition from a product page.
   *
   * Details are read from `#detailBullets_feature_div` first; any field still
   * missing afterwards is looked up in the alternative `#productDetailsTable`
   * layout. Rows are matched by Italian keywords contained in their label.
   *
   * @param {string} html - Product page markup.
   * @returns {{title: ?string, subtitle: ?string, pages: ?string, format: ?string, edition: ?string}}
   *   Extracted values; a field that was not found is `null`.
   */
  extractData(html) {
    const $ = cheerio.load(html);
    const textOrNull = (selector) => $(selector).text().trim() || null;

    const title = textOrNull('#productTitle');
    // Subtitle: read from #productSubtitle when the page has one.
    const subtitle = textOrNull('#productSubtitle');

    const details = { pages: null, format: null, edition: null };
    // [detail field, keyword searched in the lower-cased row label], in priority order.
    const keywords = [
      ['pages', 'pagine'],
      ['format', 'formato'],
      ['edition', 'edizione'],
    ];

    // Stores `value` in the first field whose keyword occurs in `label`.
    // With `keepExisting`, fields that already hold a value are skipped.
    const assignDetail = (label, value, keepExisting) => {
      const match = keywords.find(
        ([field, keyword]) => label.includes(keyword) && !(keepExisting && details[field])
      );
      if (match) {
        details[match[0]] = value;
      }
    };

    // Primary layout: bullet list where the label is the bold span and the
    // value is the last span of the item.
    $('#detailBullets_feature_div li').each((_, el) => {
      const label = $(el).find('span.a-text-bold').text().trim().toLowerCase();
      const value = $(el).find('span').last().text().trim();
      assignDetail(label, value, false);
    });

    // Fallback layout: th/td table, used only to fill fields still missing.
    if (!details.pages || !details.format || !details.edition) {
      $('#productDetailsTable .content tr').each((_, el) => {
        const label = $(el).find('th').text().trim().toLowerCase();
        const value = $(el).find('td').text().trim();
        assignDetail(label, value, true);
      });
    }

    return { title, subtitle, ...details };
  }

  /**
   * Fetches and parses the product page for one ISBN.
   *
   * @param {string} isbn - Identifier of the book to look up.
   * @returns {Promise<?Object>} The result of `extractData`, or `null` when the
   *   page could not be fetched.
   */
  async scrapeISBN(isbn) {
    const html = await this.fetchPage(isbn);
    return html ? this.extractData(html) : null;
  }

  /**
   * Scrapes several ISBNs one after another.
   *
   * @param {Iterable<string>} isbnList - ISBNs to look up, processed in order.
   * @param {number} [delayMs=2000] - Pause between two consecutive requests,
   *   meant to reduce the chance of being blocked. No pause follows the last one.
   * @returns {Promise<Object[]>} One entry per ISBN: `{ isbn, ...data }`, or
   *   just `{ isbn }` when that lookup failed.
   */
  async scrapeMultiple(isbnList, delayMs = 2000) {
    const results = [];
    let isFirst = true;
    for (const isbn of isbnList) {
      // Wait only between requests: sleeping after the final one would just
      // delay the caller.
      if (!isFirst && delayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      }
      isFirst = false;

      console.log(`Scraping ISBN: ${isbn}`);
      const data = await this.scrapeISBN(isbn);
      // Spreading `null` adds nothing, so failed lookups yield `{ isbn }`.
      results.push({ isbn, ...data });
    }
    return results;
  }
}
/**
 * Looks up a single book on Amazon by ISBN.
 *
 * This function has no access to an HTTP response object, so failures are
 * reported through the return value and the caller decides how to answer the
 * client.
 *
 * @param {*} idapp - Application identifier; not used by this function.
 * @param {{isbn?: string}} options - Lookup options; `isbn` is the book to scrape.
 * @returns {Promise<?Object>} The data returned by `AmazonBookScraper#scrapeISBN`,
 *   or `null` when the ISBN is missing, the page could not be fetched, or
 *   scraping threw (the error is logged).
 */
export async function ScraperDataAmazon(idapp, options) {
  const { isbn } = options || {};
  if (!isbn) {
    // Without an ISBN there is no product page to request.
    console.error('ScraperDataAmazon: missing options.isbn');
    return null;
  }

  try {
    const scraper = new AmazonBookScraper();
    const data = await scraper.scrapeISBN(isbn);
    console.log(data);
    return data;
  } catch (e) {
    // The previous handler referenced `res` and `server_constants`, which do
    // not exist in this module, so the real error was replaced by a ReferenceError.
    console.error(e);
    return null;
  }
}
/**
 * Looks up several books on Amazon by ISBN, one after another.
 *
 * This function has no access to an HTTP response object, so failures are
 * reported through the return value.
 *
 * @param {*} idapp - Application identifier; not used by this function.
 * @param {{isbnList?: string[]}} [options] - `isbnList` holds the ISBNs to
 *   scrape; when it is absent or empty the built-in sample list is used.
 * @returns {Promise<?Object[]>} The entries produced by
 *   `AmazonBookScraper#scrapeMultiple`, or `null` when scraping threw (the
 *   error is logged).
 */
export async function ScraperMultipleDataAmazon(idapp, options) {
  const { isbnList } = options || {};
  // Keep the original sample ISBN as fallback so existing callers behave as before.
  const requestedIsbns =
    Array.isArray(isbnList) && isbnList.length > 0 ? isbnList : ['8850224248'];

  try {
    const scraper = new AmazonBookScraper();
    const books = await scraper.scrapeMultiple(requestedIsbns);
    console.log(books);
    return books;
  } catch (e) {
    // The previous handler referenced `res` and `server_constants`, which do
    // not exist in this module, so the real error was replaced by a ReferenceError.
    console.error(e);
    return null;
  }
}
export default AmazonBookScraper;