import puppeteer from 'puppeteer';
import type { Browser, Page } from 'puppeteer';
import type { AmazonOrder, AmazonOrderItem } from './types';

/** Amazon "Your Payments" transactions page; redirects to sign-in when logged out. */
const TRANSACTIONS_URL = 'https://www.amazon.com/cpe/yourpayments/transactions';

/** Desktop Chrome user agent sent by every page this scraper opens. */
const USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Browser profile directory; reusing it keeps the manual login between runs. */
const USER_DATA_DIR = './.puppeteer_cache';

/** Window/viewport size used by the list and order-detail scrapers. */
const VIEWPORT = { width: 1920, height: 1080 };
const WINDOW_SIZE_ARG = '--window-size=1920,1080';

/** One transaction row as extracted in the browser, before date parsing/filtering. */
interface RawTransaction {
  /** Real Amazon order ID, or a synthetic `txn_…` ID when the row has no order link. */
  orderId: string;
  /** Text of the date header the row was listed under. */
  dateText: string;
  /** Absolute transaction amount in dollars. */
  total: number;
  /** Text of the last `span.a-size-base` in the row (falls back to 'Amazon Purchase'). */
  description: string;
  /** Absolute URL of the order link, or '' when the row has none. */
  orderUrl: string;
}

/**
 * Scrapes Amazon payment transactions and order details with a visible
 * Puppeteer browser. No credentials are stored: the user logs in manually and
 * the session is persisted in the browser profile directory.
 */
export class AmazonScraper {
  constructor() {
    // No credentials needed - user logs in manually
  }

  /**
   * Helper function to wait/sleep for a specified duration
   * @param ms - Milliseconds to wait
   */
  private wait(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Launch a visible browser that shares the persisted profile.
   * @param args - Extra Chromium command-line flags
   */
  private launchBrowser(args: string[] = []): Promise<Browser> {
    return puppeteer.launch({
      headless: false, // Set to true in production, false for debugging
      userDataDir: USER_DATA_DIR, // Persist session
      args,
    });
  }

  /**
   * Open a new tab with the scraper's user agent applied.
   * @param browser - Browser to open the tab in
   * @param viewport - Optional viewport size; left at the default when omitted
   */
  private async openPage(
    browser: Browser,
    viewport?: { width: number; height: number },
  ): Promise<Page> {
    const page = await browser.newPage();
    if (viewport) {
      await page.setViewport(viewport);
    }
    // Set user agent to avoid detection
    await page.setUserAgent(USER_AGENT);
    return page;
  }

  /**
   * Scrape Amazon orders for a given date range
   *
   * Opens the transactions page, pauses so the user can log in, writes a
   * screenshot and HTML dump for debugging, then extracts the transactions
   * rendered on the page and keeps those dated within the range (inclusive).
   *
   * @param startDate - Start date for orders
   * @param endDate - End date for orders
   * @returns Array of Amazon orders
   * @throws Rethrows any navigation/extraction error after logging it
   */
  async scrapeOrders(startDate: Date, endDate: Date): Promise<AmazonOrder[]> {
    const browser = await this.launchBrowser([WINDOW_SIZE_ARG]);

    try {
      const page = await this.openPage(browser, VIEWPORT);

      console.log('Navigating to Amazon transactions page...');
      console.log('Please log in manually if needed. The browser will wait for you.');

      // Navigate directly to transactions page - this will redirect to login if needed
      await page.goto(TRANSACTIONS_URL, {
        waitUntil: 'domcontentloaded',
        timeout: 60000,
      });

      // Wait for user to manually log in if needed
      console.log('Waiting 15 seconds for you to complete login (if needed)...');
      console.log('If you\'re already logged in, the page should load automatically.');
      await this.wait(15000);

      // Navigate to transactions page again in case login redirected elsewhere
      console.log('Ensuring we\'re on the transactions page...');
      await page.goto(TRANSACTIONS_URL, {
        waitUntil: 'domcontentloaded',
        timeout: 60000,
      });
      await this.wait(3000);

      // Take a screenshot for debugging
      await page.screenshot({ path: 'debug-transactions-page.png', fullPage: true });
      console.log('Screenshot saved to debug-transactions-page.png');

      // Save HTML for inspection
      const html = await page.content();
      await Bun.write('debug-transactions-page.html', html);
      console.log('HTML saved to debug-transactions-page.html');

      console.log('Extracting transaction data...');
      await this.logPageDebugInfo(page);

      const transactions = await this.extractTransactions(page);
      console.log(`Extracted ${transactions.length} transactions`);

      if (transactions.length > 0) {
        console.log('First 3 transactions (raw data):');
        transactions.slice(0, 3).forEach((txn, i) => {
          console.log(` ${i + 1}. Date: "${txn.dateText}", Amount: $${txn.total}, Desc: "${txn.description}"`);
        });
      } else {
        console.log('WARNING: No transactions were extracted from the page!');
      }

      // Process and filter transactions by date
      const orders: AmazonOrder[] = [];
      for (const txn of transactions) {
        const txnDate = new Date(txn.dateText);

        // Check if date is valid and within range
        const inRange =
          !Number.isNaN(txnDate.getTime()) && txnDate >= startDate && txnDate <= endDate;
        if (!inRange) {
          continue;
        }

        // Check if description suggests it's an Amazon order.
        // NOTE(review): orderId is never empty (rows without an order link get a
        // synthetic `txn_…` ID), so this check currently accepts every
        // transaction. Left as-is because fetchProductDetails explicitly handles
        // `txn_` orders; confirm whether the filter should be tightened.
        const description = txn.description.toLowerCase();
        const isAmazonOrder =
          description.includes('amazon') || description.includes('order') || Boolean(txn.orderId);
        if (!isAmazonOrder) {
          continue;
        }

        orders.push({
          orderId: txn.orderId,
          orderDate: txnDate,
          total: txn.total,
          items: [
            {
              title: txn.description || 'Amazon Purchase',
              price: txn.total,
              quantity: 1,
            },
          ],
          orderUrl: txn.orderUrl || `https://www.amazon.com/cpe/yourpayments/transactions`,
        });
      }

      console.log(`Found ${orders.length} Amazon orders within date range`);
      return orders;
    } catch (error) {
      console.error('Error scraping Amazon orders:', error);
      throw error;
    } finally {
      await browser.close();
    }
  }

  /**
   * Log a summary of what is on the page (div count, transaction-like class
   * names, data-testid values, leading body text) to help adjust selectors.
   * @param page - Page currently showing the transactions list
   */
  private async logPageDebugInfo(page: Page): Promise<void> {
    // The callback is serialized and run in the browser, so it must not
    // reference anything from this module's scope.
    const debugInfo = await page.evaluate(() => {
      // Get all elements that might be transactions
      const allDivs = document.querySelectorAll('div');
      const classNames = new Set<string>();
      const dataTestIds = new Set<string>();

      allDivs.forEach((div) => {
        if (div.className && typeof div.className === 'string') {
          div.className.split(' ').forEach((cls) => {
            const lowered = cls.toLowerCase();
            if (
              lowered.includes('transaction') ||
              lowered.includes('payment') ||
              lowered.includes('order')
            ) {
              classNames.add(cls);
            }
          });
        }
        const testId = div.getAttribute('data-testid');
        if (testId) {
          dataTestIds.add(testId);
        }
      });

      return {
        totalDivs: allDivs.length,
        relevantClasses: Array.from(classNames),
        dataTestIds: Array.from(dataTestIds),
        bodyText: document.body.innerText.substring(0, 500),
      };
    });

    console.log('Debug Info:');
    console.log('- Total divs on page:', debugInfo.totalDivs);
    console.log('- Relevant class names found:', debugInfo.relevantClasses);
    console.log('- Data-testid attributes found:', debugInfo.dataTestIds);
    console.log('- First 500 chars of page:', debugInfo.bodyText);
  }

  /**
   * Extract transaction information from the page.
   *
   * Walks each `.apx-transaction-date-container` and the siblings that follow
   * it (up to the next date container), collecting every
   * `.apx-transactions-line-item-component-container` row that has a date and
   * a positive amount.
   *
   * @param page - Page currently showing the transactions list
   * @returns Raw transactions in page order
   */
  private extractTransactions(page: Page): Promise<RawTransaction[]> {
    // Runs in the browser: keep the callback self-contained. The console.*
    // calls inside it print to the browser's DevTools console, not to Node.
    return page.evaluate(() => {
      const extracted: RawTransaction[] = [];

      // Find all date containers first
      const dateContainers = document.querySelectorAll('.apx-transaction-date-container');
      console.log('Found date containers:', dateContainers.length);

      dateContainers.forEach((dateContainer) => {
        const dateText = dateContainer.textContent?.trim() || '';

        // Walk following siblings until the next date container
        for (
          let section = dateContainer.nextElementSibling;
          section && !section.classList.contains('apx-transaction-date-container');
          section = section.nextElementSibling
        ) {
          // Find all transaction line items within this section
          const rows = section.querySelectorAll('.apx-transactions-line-item-component-container');

          rows.forEach((row) => {
            try {
              // Extract amount - first dollar amount in the row, sign dropped
              const amountText = (row.textContent || '').match(/-?\$\s*([\d,]+\.\d{2})/)?.[1];
              const total = amountText ? Math.abs(parseFloat(amountText.replace(/,/g, ''))) : 0;

              // Extract order link and ID
              const href = row.querySelector('a[href*="orderID"]')?.getAttribute('href') || '';
              const orderId = href.match(/orderID=([A-Z0-9-]+)/i)?.[1] ?? '';

              // Extract merchant/description - the last span usually contains it
              const spans = row.querySelectorAll('span.a-size-base');
              const description =
                spans[spans.length - 1]?.textContent?.trim() || 'Amazon Purchase';

              console.log(`Transaction: Date="${dateText}", Amount=$${total}, Desc="${description}", Order=${orderId}`);

              if (dateText && total > 0) {
                extracted.push({
                  // Synthetic `txn_` IDs mark rows with no order link;
                  // fetchProductDetails skips them.
                  orderId: orderId || `txn_${Date.now()}_${Math.random().toString(36).substring(7)}`,
                  dateText,
                  total,
                  description,
                  orderUrl: href ? (href.startsWith('http') ? href : `https://www.amazon.com${href}`) : '',
                });
              }
            } catch (err) {
              console.error('Error extracting transaction:', err);
            }
          });
        }
      });

      return extracted;
    });
  }

  /**
   * Fetch product details for matched orders
   *
   * Visits each order's `orderUrl`, saves a screenshot/HTML dump per order, and
   * replaces `order.items` in place when products are found. Orders without a
   * URL or with a synthetic `txn_` ID are skipped. Failures are logged and the
   * order keeps its existing items; this method does not throw for them.
   *
   * @param orders - Array of Amazon orders that need product details
   * @returns Updated orders with product information (the same array instance)
   */
  async fetchProductDetails(orders: AmazonOrder[]): Promise<AmazonOrder[]> {
    const browser = await this.launchBrowser([WINDOW_SIZE_ARG]);

    try {
      // Same user agent as scrapeOrders so the persisted session is reused
      // under a consistent browser identity.
      const page = await this.openPage(browser, VIEWPORT);

      console.log(`Fetching product details for ${orders.length} orders...`);

      for (const [index, order] of orders.entries()) {
        const position = `${index + 1}/${orders.length}`;

        // Skip if no order URL or no valid order ID
        if (!order.orderUrl || !order.orderId || order.orderId.startsWith('txn_')) {
          console.log(`Skipping order ${position} - no valid order ID`);
          continue;
        }

        try {
          console.log(`Fetching details for order ${position}: ${order.orderId}`);

          // Navigate to order details page
          await page.goto(order.orderUrl, {
            waitUntil: 'domcontentloaded',
            timeout: 30000,
          });
          await this.wait(2000);

          // Save debug files for this order page
          const safeOrderId = order.orderId.replace(/[^a-zA-Z0-9-]/g, '_');
          await page.screenshot({ path: `debug-order-${safeOrderId}.png`, fullPage: true });
          const html = await page.content();
          await Bun.write(`debug-order-${safeOrderId}.html`, html);
          console.log(` Debug files saved: debug-order-${safeOrderId}.{png,html}`);

          const productDetails = await this.extractOrderItems(page);

          if (productDetails.length > 0) {
            order.items = productDetails;
            console.log(
              ` Found ${productDetails.length} products:`,
              productDetails.map((p) => p.title).join(', '),
            );
          } else {
            console.log(` No products found, keeping merchant name`);
          }

          // Small delay between requests to avoid rate limiting
          await this.wait(1000);
        } catch (error) {
          console.error(`Error fetching order details for ${order.orderId}:`, error);
          // Keep the default items with merchant name
        }
      }

      console.log('Finished fetching product details');
      return orders;
    } catch (error) {
      console.error('Error in fetchProductDetails:', error);
      return orders;
    } finally {
      await browser.close();
    }
  }

  /**
   * Extract product information from the order details page.
   *
   * Primary selector is `[data-component=purchasedItemsRightGrid]`; when that
   * yields nothing, falls back to link/bold text inside `.a-box-group` (price
   * unknown, reported as 0). Quantity is always reported as 1.
   *
   * @param page - Page currently showing an order details view
   * @returns Items found on the page, possibly empty
   */
  private extractOrderItems(page: Page): Promise<AmazonOrderItem[]> {
    // Runs in the browser: keep the callback self-contained.
    return page.evaluate(() => {
      const items: AmazonOrderItem[] = [];

      document.querySelectorAll('[data-component=purchasedItemsRightGrid]').forEach((elem) => {
        try {
          // Get product title
          const title = elem.querySelector('[data-component=itemTitle]')?.textContent?.trim() || '';

          // Get price if available
          const priceText = elem.querySelector('[data-component=unitPrice]')?.textContent?.trim() || '';
          const priceDigits = priceText.match(/\$\s*([\d,]+\.\d{2})/)?.[1];
          const price = priceDigits ? parseFloat(priceDigits.replace(/,/g, '')) : 0;

          // Quantity is not parsed from the page
          const quantity = 1;

          // Very short strings are treated as noise, not product titles
          if (title && title.length > 5) {
            items.push({ title, price, quantity });
          }
        } catch (err) {
          console.error('Error extracting product:', err);
        }
      });

      // If no products found, try alternative selectors
      if (items.length === 0) {
        document.querySelectorAll('.a-box-group').forEach((box) => {
          const title = box.querySelector('a.a-link-normal, .a-text-bold')?.textContent?.trim() || '';
          const isActionLink = title.includes('Track package') || title.includes('View order');
          if (title && title.length > 10 && !isActionLink) {
            items.push({ title, price: 0, quantity: 1 });
          }
        });
      }

      return items;
    });
  }

  /**
   * Get order details for a specific order ID
   *
   * NOTE(review): `orderDate` is `new Date(<text of .order-date-invoice-item>)`
   * and will be an Invalid Date if that text is not directly parseable —
   * callers should check before relying on it.
   *
   * @param orderId - Amazon order ID
   * @returns Order details, or `null` if the page could not be loaded/parsed
   */
  async getOrderDetails(orderId: string): Promise<AmazonOrder | null> {
    const browser = await this.launchBrowser();

    try {
      const page = await this.openPage(browser);

      // Encode the ID so unexpected characters cannot alter the query string
      const orderUrl = `https://www.amazon.com/gp/your-account/order-details?orderID=${encodeURIComponent(orderId)}`;
      await page.goto(orderUrl, { waitUntil: 'networkidle2' });

      // Extract detailed order information (runs in the browser)
      const orderDetails = await page.evaluate(() => {
        // Strip everything but digits and dots, then parse; 0 when unparseable
        const toAmount = (text: string | undefined): number => {
          const parsed = parseFloat(text?.replace(/[^0-9.]/g, '') || '0');
          return Number.isFinite(parsed) ? parsed : 0;
        };

        const orderDateText =
          document.querySelector('.order-date-invoice-item')?.textContent?.trim() || '';
        const total = toAmount(document.querySelector('.grand-total-price')?.textContent?.trim());

        const items: AmazonOrderItem[] = [];
        document.querySelectorAll('.product').forEach((item) => {
          const title = item.querySelector('.product-title')?.textContent?.trim() || '';
          const price = toAmount(item.querySelector('.product-price')?.textContent?.trim());

          const quantityText =
            item.querySelector('.quantity')?.textContent?.trim().replace(/[^0-9]/g, '') || '1';
          const quantity = parseInt(quantityText, 10);

          if (title) {
            items.push({ title, price, quantity });
          }
        });

        return { orderDateText, total, items };
      });

      return {
        orderId,
        orderDate: new Date(orderDetails.orderDateText),
        total: orderDetails.total,
        items: orderDetails.items,
        orderUrl,
      };
    } catch (error) {
      console.error(`Error getting details for order ${orderId}:`, error);
      return null;
    } finally {
      await browser.close();
    }
  }
}