// ynab-amazon-helper/amazon-scraper.ts
// 2025-11-04 12:27:02 -05:00
//
// 424 lines
// 16 KiB
// TypeScript

import puppeteer from 'puppeteer';
import type { AmazonOrder, AmazonOrderItem } from './types';
/** User agent presented to Amazon by every page this scraper opens. */
const SCRAPER_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Amazon "Your Payments" transaction list; redirects to login when signed out. */
const TRANSACTIONS_URL = 'https://www.amazon.com/cpe/yourpayments/transactions';

/** Prefix of the placeholder IDs given to transactions that expose no order link. */
const SYNTHETIC_ORDER_ID_PREFIX = 'txn_';

/** Window and viewport size shared by every browser session. */
const VIEWPORT = { width: 1920, height: 1080 };

/** Optional settings for {@link AmazonScraper}; omitted fields keep the historical defaults. */
export interface AmazonScraperOptions {
  /** Run Chrome without a visible window. Default `false`, because login is done by hand. */
  headless?: boolean;
  /** Chrome profile directory used to persist the session. Default `./.puppeteer_cache`. */
  userDataDir?: string;
  /** Pause (ms) after the first navigation so the user can log in. Default `15000`. */
  loginWaitMs?: number;
  /** Write page screenshots / HTML dumps and print the DOM survey. Default `true`. */
  debug?: boolean;
}

/** One row of the transactions page, exactly as extracted inside the browser. */
interface ScrapedTransaction {
  orderId: string;
  dateText: string;
  total: number;
  description: string;
  orderUrl: string;
}

/** One purchased product as extracted from an order page. */
interface ScrapedItem {
  title: string;
  price: number;
  quantity: number;
}

// Derived from the puppeteer import already in scope so no extra type import is needed.
type ScraperBrowser = Awaited<ReturnType<typeof puppeteer.launch>>;
type ScraperPage = Awaited<ReturnType<ScraperBrowser['newPage']>>;

/**
 * Scrapes Amazon payment transactions and order pages with a visible,
 * session-persisting Chrome instance. No credentials are stored: the user logs
 * in manually in the opened browser window.
 */
export class AmazonScraper {
  private readonly headless: boolean;
  private readonly userDataDir: string;
  private readonly loginWaitMs: number;
  private readonly debug: boolean;

  /**
   * @param options - Optional overrides; calling with no argument behaves as before.
   */
  constructor(options: AmazonScraperOptions = {}) {
    // No credentials needed - user logs in manually
    this.headless = options.headless ?? false;
    this.userDataDir = options.userDataDir ?? './.puppeteer_cache';
    this.loginWaitMs = options.loginWaitMs ?? 15000;
    this.debug = options.debug ?? true;
  }

  /**
   * Helper function to wait/sleep for a specified duration
   * @param ms - Milliseconds to wait
   */
  private wait(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /** Launches Chrome with the configured profile directory and window size. */
  private launchBrowser(): Promise<ScraperBrowser> {
    return puppeteer.launch({
      headless: this.headless,
      userDataDir: this.userDataDir, // Persist session
      args: [`--window-size=${VIEWPORT.width},${VIEWPORT.height}`],
    });
  }

  /** Opens a tab with the shared viewport and user agent so all methods look alike to Amazon. */
  private async openPage(browser: ScraperBrowser): Promise<ScraperPage> {
    const page = await browser.newPage();
    await page.setViewport(VIEWPORT);
    await page.setUserAgent(SCRAPER_USER_AGENT);
    return page;
  }

  /**
   * Writes `<baseName>.png` and `<baseName>.html` for the current page.
   * Best-effort: a failure is reported and swallowed so that debugging output
   * can never abort a scrape.
   * @returns `true` when both files were written.
   */
  private async saveDebugArtifacts(page: ScraperPage, baseName: string): Promise<boolean> {
    try {
      await page.screenshot({ path: `${baseName}.png`, fullPage: true });
      await Bun.write(`${baseName}.html`, await page.content());
      return true;
    } catch (error: unknown) {
      console.warn(`Could not save debug files for ${baseName}:`, error);
      return false;
    }
  }

  /**
   * Converts raw transactions into orders, keeping those whose date parses and
   * lies within `[startDate, endDate]` (both inclusive).
   *
   * NOTE(review): the keyword / order-ID check below is kept from the original
   * logic, but every transaction produced by `scrapeOrders` carries a non-empty
   * (possibly synthetic) order ID, so in practice it never rejects anything.
   */
  private static selectOrdersInRange(
    transactions: readonly ScrapedTransaction[],
    startDate: Date,
    endDate: Date,
  ): AmazonOrder[] {
    const orders: AmazonOrder[] = [];
    for (const txn of transactions) {
      const orderDate = new Date(txn.dateText);
      if (Number.isNaN(orderDate.getTime()) || orderDate < startDate || orderDate > endDate) {
        continue;
      }

      const description = txn.description.toLowerCase();
      const looksLikeAmazonOrder =
        description.includes('amazon') || description.includes('order') || txn.orderId !== '';
      if (!looksLikeAmazonOrder) {
        continue;
      }

      orders.push({
        orderId: txn.orderId,
        orderDate,
        total: txn.total,
        // Until product details are fetched, the merchant text stands in as the only item.
        items: [{ title: txn.description || 'Amazon Purchase', price: txn.total, quantity: 1 }],
        orderUrl: txn.orderUrl || TRANSACTIONS_URL,
      });
    }
    return orders;
  }

  /**
   * Scrape Amazon orders for a given date range
   * @param startDate - Start date for orders (inclusive)
   * @param endDate - End date for orders (inclusive)
   * @returns Array of Amazon orders
   * @throws Rethrows any navigation or extraction error after logging it.
   */
  async scrapeOrders(startDate: Date, endDate: Date): Promise<AmazonOrder[]> {
    const browser = await this.launchBrowser();
    try {
      const page = await this.openPage(browser);

      console.log('Navigating to Amazon transactions page...');
      console.log('Please log in manually if needed. The browser will wait for you.');
      // Navigate directly to transactions page - this will redirect to login if needed
      await page.goto(TRANSACTIONS_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });

      // Wait for user to manually log in if needed
      console.log(
        `Waiting ${Math.round(this.loginWaitMs / 1000)} seconds for you to complete login (if needed)...`,
      );
      console.log("If you're already logged in, the page should load automatically.");
      await this.wait(this.loginWaitMs);

      // Navigate to transactions page again in case login redirected elsewhere
      console.log("Ensuring we're on the transactions page...");
      await page.goto(TRANSACTIONS_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });
      await this.wait(3000);

      if (this.debug) {
        if (await this.saveDebugArtifacts(page, 'debug-transactions-page')) {
          console.log('Screenshot saved to debug-transactions-page.png');
          console.log('HTML saved to debug-transactions-page.html');
        }

        // Survey the DOM so selector breakage is easy to diagnose from the terminal.
        const debugInfo = await page.evaluate(() => {
          const keywords = ['transaction', 'payment', 'order'];
          const allDivs = document.querySelectorAll('div');
          const classNames = new Set<string>();
          const dataTestIds = new Set<string>();
          allDivs.forEach((div) => {
            div.classList.forEach((cls) => {
              const lowered = cls.toLowerCase();
              if (keywords.some((keyword) => lowered.includes(keyword))) {
                classNames.add(cls);
              }
            });
            const testId = div.getAttribute('data-testid');
            if (testId) {
              dataTestIds.add(testId);
            }
          });
          return {
            totalDivs: allDivs.length,
            relevantClasses: Array.from(classNames),
            dataTestIds: Array.from(dataTestIds),
            bodyText: document.body.innerText.substring(0, 500),
          };
        });
        console.log('Debug Info:');
        console.log('- Total divs on page:', debugInfo.totalDivs);
        console.log('- Relevant class names found:', debugInfo.relevantClasses);
        console.log('- Data-testid attributes found:', debugInfo.dataTestIds);
        console.log('- First 500 chars of page:', debugInfo.bodyText);
      }

      console.log('Extracting transaction data...');
      // The callback runs inside the browser: it cannot see Node-side variables,
      // so the synthetic-ID prefix is passed in as an argument.
      const { dateContainerCount, transactions } = await page.evaluate((syntheticIdPrefix: string) => {
        const extracted: ScrapedTransaction[] = [];
        // Each date header is followed by sibling sections holding that day's rows.
        const dateContainers = document.querySelectorAll('.apx-transaction-date-container');
        dateContainers.forEach((dateContainer) => {
          const dateText = dateContainer.textContent?.trim() || '';
          let section = dateContainer.nextElementSibling;
          // Walk forward until the next date header (or the end of the list).
          while (section && !section.classList.contains('apx-transaction-date-container')) {
            section
              .querySelectorAll('.apx-transactions-line-item-component-container')
              .forEach((row) => {
                try {
                  // First dollar amount in the row; the sign is outside the capture
                  // group, so the parsed value is always non-negative.
                  const amountMatch = (row.textContent || '').match(/-?\$\s*([\d,]+\.\d{2})/);
                  const amountText = amountMatch?.[1];
                  const total = amountText ? parseFloat(amountText.replace(/,/g, '')) : 0;

                  // Extract order link and ID
                  const href = row.querySelector('a[href*="orderID"]')?.getAttribute('href') || '';
                  const orderId = href.match(/orderID=([A-Z0-9-]+)/i)?.[1] ?? '';

                  // The last span usually has the merchant name
                  const spans = row.querySelectorAll('span.a-size-base');
                  const merchantSpan = spans.length > 0 ? spans[spans.length - 1] : undefined;
                  const description = merchantSpan?.textContent?.trim() || 'Amazon Purchase';

                  // Visible in the browser's DevTools console, not in the terminal.
                  console.log(`Transaction: Date="${dateText}", Amount=$${total}, Desc="${description}", Order=${orderId}`);

                  if (dateText && total > 0) {
                    let orderUrl = '';
                    if (href) {
                      orderUrl = href.startsWith('http') ? href : `https://www.amazon.com${href}`;
                    }
                    extracted.push({
                      // The row index keeps placeholder IDs unique within one scrape.
                      orderId:
                        orderId ||
                        `${syntheticIdPrefix}${Date.now()}_${extracted.length}_${Math.random().toString(36).slice(2, 8)}`,
                      dateText,
                      total,
                      description,
                      orderUrl,
                    });
                  }
                } catch (err) {
                  console.error('Error extracting transaction:', err);
                }
              });
            section = section.nextElementSibling;
          }
        });
        return { dateContainerCount: dateContainers.length, transactions: extracted };
      }, SYNTHETIC_ORDER_ID_PREFIX);

      console.log('Found date containers:', dateContainerCount);
      console.log(`Extracted ${transactions.length} transactions`);
      if (transactions.length > 0) {
        console.log('First 3 transactions (raw data):');
        transactions.slice(0, 3).forEach((txn, i) => {
          console.log(`  ${i + 1}. Date: "${txn.dateText}", Amount: $${txn.total}, Desc: "${txn.description}"`);
        });
      } else {
        console.log('WARNING: No transactions were extracted from the page!');
      }

      // Process and filter transactions by date
      const orders = AmazonScraper.selectOrdersInRange(transactions, startDate, endDate);
      console.log(`Found ${orders.length} Amazon orders within date range`);
      return orders;
    } catch (error: unknown) {
      console.error('Error scraping Amazon orders:', error);
      throw error;
    } finally {
      await browser.close();
    }
  }

  /**
   * Fetch product details for matched orders
   *
   * Best-effort: an order whose page cannot be loaded or parsed keeps its
   * existing `items`. The given order objects are updated in place.
   * @param orders - Array of Amazon orders that need product details
   * @returns The same array, with `items` replaced wherever products were found
   */
  async fetchProductDetails(orders: AmazonOrder[]): Promise<AmazonOrder[]> {
    const browser = await this.launchBrowser();
    try {
      const page = await this.openPage(browser);
      console.log(`Fetching product details for ${orders.length} orders...`);

      for (const [index, order] of orders.entries()) {
        const position = `${index + 1}/${orders.length}`;
        // Skip if no order URL or no valid order ID
        if (!order.orderUrl || !order.orderId || order.orderId.startsWith(SYNTHETIC_ORDER_ID_PREFIX)) {
          console.log(`Skipping order ${position} - no valid order ID`);
          continue;
        }

        try {
          console.log(`Fetching details for order ${position}: ${order.orderId}`);
          // Navigate to order details page
          await page.goto(order.orderUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
          await this.wait(2000);

          if (this.debug) {
            // Sanitise the ID because it becomes part of a file name.
            const safeOrderId = order.orderId.replace(/[^a-zA-Z0-9-]/g, '_');
            if (await this.saveDebugArtifacts(page, `debug-order-${safeOrderId}`)) {
              console.log(`  Debug files saved: debug-order-${safeOrderId}.{png,html}`);
            }
          }

          // Extract product information from the order details page
          const productDetails = await page.evaluate(() => {
            const items: ScrapedItem[] = [];
            document.querySelectorAll('[data-component=purchasedItemsRightGrid]').forEach((elem) => {
              try {
                const title = elem.querySelector('[data-component=itemTitle]')?.textContent?.trim() || '';
                const priceText = elem.querySelector('[data-component=unitPrice]')?.textContent?.trim() || '';
                const priceDigits = priceText.match(/\$\s*([\d,]+\.\d{2})/)?.[1];
                const price = priceDigits ? parseFloat(priceDigits.replace(/,/g, '')) : 0;
                // Quantity is not read from the page; each product counts once.
                if (title.length > 5) {
                  items.push({ title, price, quantity: 1 });
                }
              } catch (err) {
                console.error('Error extracting product:', err);
              }
            });

            // If no products found, try alternative selectors
            if (items.length === 0) {
              document.querySelectorAll('.a-box-group').forEach((box) => {
                const title = box.querySelector('a.a-link-normal, .a-text-bold')?.textContent?.trim() || '';
                const isNavigationLink = title.includes('Track package') || title.includes('View order');
                if (title.length > 10 && !isNavigationLink) {
                  items.push({ title, price: 0, quantity: 1 });
                }
              });
            }
            return items;
          });

          if (productDetails.length > 0) {
            order.items = productDetails;
            console.log(
              `  Found ${productDetails.length} products:`,
              productDetails.map((p) => p.title).join(', '),
            );
          } else {
            console.log(`  No products found, keeping merchant name`);
          }

          // Small delay between requests to avoid rate limiting
          await this.wait(1000);
        } catch (error: unknown) {
          console.error(`Error fetching order details for ${order.orderId}:`, error);
          // Keep the default items with merchant name
        }
      }

      console.log('Finished fetching product details');
      return orders;
    } catch (error: unknown) {
      console.error('Error in fetchProductDetails:', error);
      return orders;
    } finally {
      await browser.close();
    }
  }

  /**
   * Get order details for a specific order ID
   * @param orderId - Amazon order ID
   * @returns Order details, or `null` when the page could not be loaded or parsed.
   *   `orderDate` is an Invalid Date when the page's date text is not parseable.
   */
  async getOrderDetails(orderId: string): Promise<AmazonOrder | null> {
    const browser = await this.launchBrowser();
    try {
      const page = await this.openPage(browser);
      // Encode the ID so an unexpected character cannot alter the query string.
      const orderUrl = `https://www.amazon.com/gp/your-account/order-details?orderID=${encodeURIComponent(orderId)}`;
      await page.goto(orderUrl, { waitUntil: 'networkidle2' });

      // Extract detailed order information
      const orderDetails = await page.evaluate(() => {
        // Helpers live inside the callback because it executes in the browser.
        const parseAmount = (text: string | null | undefined): number => {
          const value = parseFloat((text ?? '').replace(/[^0-9.]/g, ''));
          return Number.isFinite(value) ? value : 0;
        };
        const parseQuantity = (text: string | null | undefined): number => {
          const value = parseInt((text ?? '').replace(/[^0-9]/g, ''), 10);
          return Number.isNaN(value) ? 1 : value;
        };

        const orderDateText = document.querySelector('.order-date-invoice-item')?.textContent?.trim() || '';
        const total = parseAmount(document.querySelector('.grand-total-price')?.textContent);

        const items: ScrapedItem[] = [];
        document.querySelectorAll('.product').forEach((item) => {
          const title = item.querySelector('.product-title')?.textContent?.trim() || '';
          if (title) {
            items.push({
              title,
              price: parseAmount(item.querySelector('.product-price')?.textContent),
              quantity: parseQuantity(item.querySelector('.quantity')?.textContent),
            });
          }
        });
        return { orderDateText, total, items };
      });

      const orderDate = new Date(orderDetails.orderDateText);
      if (Number.isNaN(orderDate.getTime())) {
        console.warn(`Could not parse order date "${orderDetails.orderDateText}" for order ${orderId}`);
      }

      return {
        orderId,
        orderDate,
        total: orderDetails.total,
        items: orderDetails.items,
        orderUrl,
      };
    } catch (error: unknown) {
      console.error(`Error getting details for order ${orderId}:`, error);
      return null;
    } finally {
      await browser.close();
    }
  }
}