424 lines
16 KiB
TypeScript
424 lines
16 KiB
TypeScript
import puppeteer from 'puppeteer';
|
|
import type { AmazonOrder, AmazonOrderItem } from './types';
|
|
|
|
/** Options controlling how {@link AmazonScraper} drives the browser. */
export interface AmazonScraperOptions {
  /** Run Chrome without a visible window. Defaults to `false` so the user can log in manually. */
  headless?: boolean;
  /** Directory used to persist the browser profile (and thus the login session). */
  userDataDir?: string;
  /** How long `scrapeOrders` pauses to let the user finish logging in, in milliseconds. */
  loginWaitMs?: number;
  /** When `true` (default), screenshots, HTML dumps and DOM diagnostics are produced. */
  debug?: boolean;
}

/** Raw strings pulled out of one transaction row; all parsing happens on the Node side. */
interface RawTransactionRow {
  dateText: string;
  rowText: string;
  href: string;
  description: string;
}

/** A transaction row after amount/order-id parsing, before date filtering. */
interface ScrapedTransaction {
  orderId: string;
  dateText: string;
  total: number;
  description: string;
  orderUrl: string;
}

/** Title and unparsed price text of one product on an order page. */
interface RawProduct {
  title: string;
  priceText: string;
}

/** Line item in the shape this file stores under `AmazonOrder.items`. */
interface ScrapedItem {
  title: string;
  price: number;
  quantity: number;
}

// Derived from the puppeteer import so no extra type imports are required.
type ScraperBrowser = Awaited<ReturnType<typeof puppeteer.launch>>;
type ScraperPage = Awaited<ReturnType<ScraperBrowser['newPage']>>;

const AMAZON_ORIGIN = 'https://www.amazon.com';
const TRANSACTIONS_URL = `${AMAZON_ORIGIN}/cpe/yourpayments/transactions`;
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
const VIEWPORT = { width: 1920, height: 1080 };
/** Prefix of locally generated ids for transactions that carry no Amazon order link. */
const SYNTHETIC_ID_PREFIX = 'txn_';
const MONTH_NAMES = [
  'january',
  'february',
  'march',
  'april',
  'may',
  'june',
  'july',
  'august',
  'september',
  'october',
  'november',
  'december',
];

/**
 * Parse the first `$1,234.56`-style amount found in `text`.
 * @returns The unsigned amount, or `0` when no such amount is present.
 */
function parseDollarAmount(text: string): number {
  const match = /\$\s*([\d,]+\.\d{2})/.exec(text);
  const value = Number.parseFloat((match?.[1] ?? '').replace(/,/g, ''));
  return Number.isFinite(value) ? value : 0;
}

/**
 * Parse a price from free-form text. A `$`-anchored amount wins; otherwise every
 * non-numeric character is stripped and the remainder is parsed.
 * @returns The parsed amount, or `0` when nothing numeric can be read.
 */
function parseLooseAmount(text: string): number {
  const strict = parseDollarAmount(text);
  if (strict > 0) {
    return strict;
  }
  const value = Number.parseFloat(text.replace(/[^0-9.]/g, ''));
  return Number.isFinite(value) ? value : 0;
}

/** Parse a quantity label such as `"Qty: 2"`; falls back to `1` when no positive number is found. */
function parseQuantity(text: string): number {
  const value = Number.parseInt(text.replace(/[^0-9]/g, ''), 10);
  return Number.isFinite(value) && value > 0 ? value : 1;
}

/**
 * Parse a human-readable date such as `"January 5, 2024"`, tolerating surrounding
 * words (e.g. `"Ordered on January 5, 2024"`). Falls back to the engine's own
 * `Date` parsing for any other format.
 * @returns A local-midnight `Date` for month-name dates, or `null` when unparsable.
 */
function parseDisplayDate(text: string): Date | null {
  const match = /\b([A-Za-z]{3,9})\.?\s+(\d{1,2}),?\s+(\d{4})\b/.exec(text);
  if (match) {
    const word = (match[1] ?? '').toLowerCase();
    const month = MONTH_NAMES.findIndex((name) => name.startsWith(word));
    const day = Number(match[2]);
    if (month >= 0) {
      const candidate = new Date(Number(match[3]), month, day);
      // Reject roll-overs such as "February 31".
      if (candidate.getDate() === day) {
        return candidate;
      }
    }
  }
  const fallback = new Date(text);
  return Number.isNaN(fallback.getTime()) ? null : fallback;
}

/**
 * Whether `date` lies in the requested range. The lower bound is widened to the
 * start of `start`'s calendar day because scraped dates have day granularity
 * only; comparing against a `start` with a time of day would otherwise drop
 * every transaction made on the first day of the range.
 */
function isWithinDayRange(date: Date, start: Date, end: Date): boolean {
  const rangeStart = new Date(start.getFullYear(), start.getMonth(), start.getDate());
  const time = date.getTime();
  return time >= rangeStart.getTime() && time <= end.getTime();
}

/** Resolve a scraped `href` against the Amazon origin; returns `''` for empty or unparsable input. */
function toAbsoluteAmazonUrl(href: string): string {
  if (!href) {
    return '';
  }
  try {
    return new URL(href, AMAZON_ORIGIN).href;
  } catch {
    return '';
  }
}

/** Build the order-details URL for `orderId`, escaping it for use as a query value. */
function buildOrderDetailsUrl(orderId: string): string {
  return `${AMAZON_ORIGIN}/gp/your-account/order-details?orderID=${encodeURIComponent(orderId)}`;
}

/**
 * Convert raw row strings into a transaction.
 * @returns `null` when the row has no date heading or no positive dollar amount.
 */
function toScrapedTransaction(row: RawTransactionRow): ScrapedTransaction | null {
  const total = parseDollarAmount(row.rowText);
  if (!row.dateText || total <= 0) {
    return null;
  }
  const orderId = /orderID=([A-Z0-9-]+)/i.exec(row.href)?.[1] ?? '';
  return {
    // Rows without an order link still need a unique key; fetchProductDetails
    // recognises the prefix and skips them.
    orderId:
      orderId || `${SYNTHETIC_ID_PREFIX}${Date.now()}_${Math.random().toString(36).slice(2, 10)}`,
    dateText: row.dateText,
    total,
    description: row.description,
    orderUrl: toAbsoluteAmazonUrl(row.href),
  };
}

/**
 * Scrapes Amazon payment transactions and order pages through a visible
 * Puppeteer-controlled browser. No credentials are stored: the user logs in
 * manually and the session is kept in the browser profile directory.
 *
 * Only the transactions rendered on the first load of the transactions page
 * are read; no pagination is performed.
 */
export class AmazonScraper {
  private readonly options: Required<AmazonScraperOptions>;

  /**
   * @param options - Optional overrides; the defaults reproduce the previous
   *   hard-coded behaviour (visible browser, `./.puppeteer_cache`, 15 s login
   *   wait, debug output enabled).
   */
  constructor(options: AmazonScraperOptions = {}) {
    this.options = {
      headless: options.headless ?? false,
      userDataDir: options.userDataDir ?? './.puppeteer_cache',
      loginWaitMs: options.loginWaitMs ?? 15000,
      debug: options.debug ?? true,
    };
  }

  /**
   * Helper function to wait/sleep for a specified duration
   * @param ms - Milliseconds to wait
   */
  private wait(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Launch a browser with the shared profile directory and window size. */
  private launchBrowser() {
    return puppeteer.launch({
      headless: this.options.headless,
      userDataDir: this.options.userDataDir,
      args: ['--window-size=1920,1080'],
    });
  }

  /** Open a tab with the viewport and user agent every scraping method uses. */
  private async openPage(browser: ScraperBrowser): Promise<ScraperPage> {
    const page = await browser.newPage();
    await page.setViewport(VIEWPORT);
    await page.setUserAgent(USER_AGENT);
    return page;
  }

  /** Navigate to the transactions page (Amazon redirects to login when needed). */
  private async gotoTransactions(page: ScraperPage): Promise<void> {
    await page.goto(TRANSACTIONS_URL, {
      waitUntil: 'domcontentloaded',
      timeout: 60000,
    });
  }

  /**
   * Write `<baseName>.png` and `<baseName>.html` for the current page.
   * Best-effort: a failure is reported but never aborts the scrape.
   * @returns `true` when both files were written.
   */
  private async saveDebugArtifacts(page: ScraperPage, baseName: string): Promise<boolean> {
    try {
      await page.screenshot({ path: `${baseName}.png`, fullPage: true });
      await Bun.write(`${baseName}.html`, await page.content());
      return true;
    } catch (error) {
      console.warn(`Could not save debug files for ${baseName}:`, error);
      return false;
    }
  }

  /** Log a summary of transaction-related class names and test ids found on the page. */
  private async logPageDiagnostics(page: ScraperPage): Promise<void> {
    const info = await page.evaluate(() => {
      const divs = document.querySelectorAll('div');
      const classNames = new Set<string>();
      const testIds = new Set<string>();

      divs.forEach((div) => {
        if (typeof div.className === 'string') {
          for (const cls of div.className.split(' ')) {
            const lower = cls.toLowerCase();
            if (
              lower.includes('transaction') ||
              lower.includes('payment') ||
              lower.includes('order')
            ) {
              classNames.add(cls);
            }
          }
        }
        const testId = div.getAttribute('data-testid');
        if (testId) {
          testIds.add(testId);
        }
      });

      return {
        totalDivs: divs.length,
        relevantClasses: Array.from(classNames),
        dataTestIds: Array.from(testIds),
        bodyText: document.body.innerText.substring(0, 500),
      };
    });

    console.log('Debug Info:');
    console.log('- Total divs on page:', info.totalDivs);
    console.log('- Relevant class names found:', info.relevantClasses);
    console.log('- Data-testid attributes found:', info.dataTestIds);
    console.log('- First 500 chars of page:', info.bodyText);
  }

  /**
   * Collect the raw text of every transaction row, grouped under the date
   * heading that precedes it. The callback runs inside the page, so it must not
   * reference anything from this module's scope; it only returns strings.
   */
  private async extractTransactionRows(
    page: ScraperPage,
  ): Promise<{ dateContainerCount: number; rows: RawTransactionRow[] }> {
    return page.evaluate(() => {
      const rows: RawTransactionRow[] = [];
      const dateContainers = document.querySelectorAll('.apx-transaction-date-container');

      dateContainers.forEach((dateContainer) => {
        const dateText = dateContainer.textContent?.trim() ?? '';

        // Walk the following siblings until the next date heading.
        let sibling = dateContainer.nextElementSibling;
        while (sibling && !sibling.classList.contains('apx-transaction-date-container')) {
          sibling
            .querySelectorAll('.apx-transactions-line-item-component-container')
            .forEach((row) => {
              const link = row.querySelector('a[href*="orderID"]');
              // The last span usually holds the merchant name.
              const spans = row.querySelectorAll('span.a-size-base');
              const lastSpan = spans.length > 0 ? spans[spans.length - 1] : undefined;
              rows.push({
                dateText,
                rowText: row.textContent ?? '',
                href: link?.getAttribute('href') ?? '',
                description: lastSpan?.textContent?.trim() || 'Amazon Purchase',
              });
            });
          sibling = sibling.nextElementSibling;
        }
      });

      return { dateContainerCount: dateContainers.length, rows };
    });
  }

  /**
   * Read product titles and unit-price text from the order page currently
   * loaded in `page`, falling back to a looser selector when the primary one
   * matches nothing. Quantity is not scraped and is always reported as 1.
   */
  private async extractProducts(page: ScraperPage): Promise<ScrapedItem[]> {
    const rawProducts = await page.evaluate(() => {
      const primary: RawProduct[] = [];
      document.querySelectorAll('[data-component=purchasedItemsRightGrid]').forEach((elem) => {
        const title = elem.querySelector('[data-component=itemTitle]')?.textContent?.trim() ?? '';
        const priceText =
          elem.querySelector('[data-component=unitPrice]')?.textContent?.trim() ?? '';
        if (title.length > 5) {
          primary.push({ title, priceText });
        }
      });
      if (primary.length > 0) {
        return primary;
      }

      // Alternative layout: take link/bold text, ignoring navigation labels.
      const fallback: RawProduct[] = [];
      document.querySelectorAll('.a-box-group').forEach((box) => {
        const title =
          box.querySelector('a.a-link-normal, .a-text-bold')?.textContent?.trim() ?? '';
        if (
          title.length > 10 &&
          !title.includes('Track package') &&
          !title.includes('View order')
        ) {
          fallback.push({ title, priceText: '' });
        }
      });
      return fallback;
    });

    return rawProducts.map(({ title, priceText }) => ({
      title,
      price: parseDollarAmount(priceText),
      quantity: 1,
    }));
  }

  /**
   * Scrape Amazon orders for a given date range
   *
   * Every transaction listed on the payments page is treated as an order; rows
   * without an order link receive a synthetic `txn_…` id. Transaction dates have
   * day granularity, so the whole calendar day of `startDate` is included.
   *
   * @param startDate - Start date for orders
   * @param endDate - End date for orders
   * @returns Array of Amazon orders
   * @throws Rethrows any navigation or extraction error after logging it.
   */
  async scrapeOrders(startDate: Date, endDate: Date): Promise<AmazonOrder[]> {
    const browser = await this.launchBrowser();

    try {
      const page = await this.openPage(browser);

      console.log('Navigating to Amazon transactions page...');
      console.log('Please log in manually if needed. The browser will wait for you.');

      // Navigate directly to transactions page - this will redirect to login if needed
      await this.gotoTransactions(page);

      const waitSeconds = Math.round(this.options.loginWaitMs / 1000);
      console.log(`Waiting ${waitSeconds} seconds for you to complete login (if needed)...`);
      console.log("If you're already logged in, the page should load automatically.");
      await this.wait(this.options.loginWaitMs);

      // Navigate to transactions page again in case login redirected elsewhere
      console.log("Ensuring we're on the transactions page...");
      await this.gotoTransactions(page);
      await this.wait(3000);

      if (this.options.debug) {
        if (await this.saveDebugArtifacts(page, 'debug-transactions-page')) {
          console.log('Screenshot saved to debug-transactions-page.png');
          console.log('HTML saved to debug-transactions-page.html');
        }
        await this.logPageDiagnostics(page);
      }

      console.log('Extracting transaction data...');
      const { dateContainerCount, rows } = await this.extractTransactionRows(page);
      console.log('Found date containers:', dateContainerCount);

      const transactions = rows
        .map(toScrapedTransaction)
        .filter((txn): txn is ScrapedTransaction => txn !== null);

      console.log(`Extracted ${transactions.length} transactions`);
      if (transactions.length > 0) {
        console.log('First 3 transactions (raw data):');
        transactions.slice(0, 3).forEach((txn, i) => {
          console.log(
            `  ${i + 1}. Date: "${txn.dateText}", Amount: $${txn.total}, Desc: "${txn.description}"`,
          );
        });
      } else {
        console.log('WARNING: No transactions were extracted from the page!');
      }

      // Keep only transactions whose date parses and falls inside the range.
      const orders: AmazonOrder[] = [];
      for (const txn of transactions) {
        const orderDate = parseDisplayDate(txn.dateText);
        if (orderDate === null || !isWithinDayRange(orderDate, startDate, endDate)) {
          continue;
        }
        orders.push({
          orderId: txn.orderId,
          orderDate,
          total: txn.total,
          items: [
            {
              title: txn.description || 'Amazon Purchase',
              price: txn.total,
              quantity: 1,
            },
          ],
          orderUrl: txn.orderUrl || TRANSACTIONS_URL,
        });
      }

      console.log(`Found ${orders.length} Amazon orders within date range`);
      return orders;
    } catch (error) {
      console.error('Error scraping Amazon orders:', error);
      throw error;
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetch product details for matched orders
   *
   * Orders are updated in place: when products are found on an order's page,
   * its `items` are replaced. Orders with a synthetic id or no URL are skipped,
   * and a failure on one order does not stop the others.
   *
   * @param orders - Array of Amazon orders that need product details
   * @returns Updated orders with product information (the same array instance)
   */
  async fetchProductDetails(orders: AmazonOrder[]): Promise<AmazonOrder[]> {
    const browser = await this.launchBrowser();

    try {
      const page = await this.openPage(browser);

      console.log(`Fetching product details for ${orders.length} orders...`);

      for (const [index, order] of orders.entries()) {
        const position = `${index + 1}/${orders.length}`;

        // Skip if no order URL or no valid order ID
        if (
          !order.orderUrl ||
          !order.orderId ||
          order.orderId.startsWith(SYNTHETIC_ID_PREFIX)
        ) {
          console.log(`Skipping order ${position} - no valid order ID`);
          continue;
        }

        try {
          console.log(`Fetching details for order ${position}: ${order.orderId}`);

          await page.goto(order.orderUrl, {
            waitUntil: 'domcontentloaded',
            timeout: 30000,
          });
          await this.wait(2000);

          if (this.options.debug) {
            // Sanitise the id so it is safe to embed in a file name.
            const safeOrderId = order.orderId.replace(/[^a-zA-Z0-9-]/g, '_');
            if (await this.saveDebugArtifacts(page, `debug-order-${safeOrderId}`)) {
              console.log(`  Debug files saved: debug-order-${safeOrderId}.{png,html}`);
            }
          }

          const products = await this.extractProducts(page);
          if (products.length > 0) {
            order.items = products;
            console.log(
              `  Found ${products.length} products:`,
              products.map((p) => p.title).join(', '),
            );
          } else {
            console.log(`  No products found, keeping merchant name`);
          }

          // Small delay between requests to avoid rate limiting
          await this.wait(1000);
        } catch (error) {
          // Keep the default items with merchant name
          console.error(`Error fetching order details for ${order.orderId}:`, error);
        }
      }

      console.log('Finished fetching product details');
      return orders;
    } catch (error) {
      console.error('Error in fetchProductDetails:', error);
      return orders;
    } finally {
      await browser.close();
    }
  }

  /**
   * Get order details for a specific order ID
   * @param orderId - Amazon order ID
   * @returns Order details, or `null` when the page cannot be loaded or no
   *   order date can be read from it (e.g. a login redirect or changed layout)
   */
  async getOrderDetails(orderId: string): Promise<AmazonOrder | null> {
    const browser = await this.launchBrowser();

    try {
      const page = await this.openPage(browser);

      const orderUrl = buildOrderDetailsUrl(orderId);
      await page.goto(orderUrl, { waitUntil: 'networkidle2' });

      // Only raw strings are gathered in the page; parsing happens below.
      const raw = await page.evaluate(() => {
        const products: { title: string; priceText: string; quantityText: string }[] = [];
        document.querySelectorAll('.product').forEach((item) => {
          const title = item.querySelector('.product-title')?.textContent?.trim() ?? '';
          if (title) {
            products.push({
              title,
              priceText: item.querySelector('.product-price')?.textContent?.trim() ?? '',
              quantityText: item.querySelector('.quantity')?.textContent?.trim() ?? '',
            });
          }
        });

        return {
          orderDateText:
            document.querySelector('.order-date-invoice-item')?.textContent?.trim() ?? '',
          totalText: document.querySelector('.grand-total-price')?.textContent?.trim() ?? '',
          products,
        };
      });

      const orderDate = parseDisplayDate(raw.orderDateText);
      if (orderDate === null) {
        console.error(
          `Could not read an order date for order ${orderId}; the page may require login or its layout may have changed.`,
        );
        return null;
      }

      return {
        orderId,
        orderDate,
        total: parseLooseAmount(raw.totalText),
        items: raw.products.map((product) => ({
          title: product.title,
          price: parseLooseAmount(product.priceText),
          quantity: parseQuantity(product.quantityText),
        })),
        orderUrl,
      };
    } catch (error) {
      console.error(`Error getting details for order ${orderId}:`, error);
      return null;
    } finally {
      await browser.close();
    }
  }
}
|