import { OllamaService } from './ollamaService';
import { Business } from '../types';
import { db } from './databaseService';

// Constants for validation and scoring
const BATCH_SIZE = 3; // Process businesses in small batches to avoid overwhelming LLM
const LLM_TIMEOUT = 30000; // 30 second timeout for LLM requests
const MIN_CONFIDENCE_SCORE = 0.7; // Minimum score required to cache results
const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // Cached clean records are stored with a 24 hour TTL
const VALID_EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
const VALID_PHONE_REGEX = /^\(\d{3}\) \d{3}-\d{4}$/;
const VALID_ADDRESS_REGEX = /^\d+.*(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|way|parkway|pkwy|place|pl),?\s+[a-z ]+,\s*[a-z]{2}\s+\d{5}$/i;

// Matches a spelled-out/abbreviated state name only in the trailing state
// position (end of string, optionally followed by a ZIP code), so street names
// such as "Colorado Blvd" are left untouched.
const TRAILING_STATE_REGEX = /\b(?:Colorado|Colo|Col)\b\.?(?=\s*,?\s*(?:\d{5}(?:-\d{4})?)?\s*$)/i;

// Keywords looked up (in this order) inside a business name to guess its type.
const BUSINESS_TYPES = [
    'restaurant', 'plumber', 'electrician', 'cafe', 'bar',
    'salon', 'shop', 'store', 'service'
] as const;

/**
 * Cleans scraped business records (address, phone, email, description) by
 * asking an LLM to normalize them and then applying deterministic validation.
 * Results whose confidence score reaches MIN_CONFIDENCE_SCORE are cached.
 */
export class CleanupService {
    /**
     * Sends the prompt to the LLM, racing it against a LLM_TIMEOUT timer.
     *
     * @param prompt - Full prompt text sent as a single user message.
     * @param originalBusiness - Record whose current values are echoed back if
     *   the LLM call fails or times out.
     * @returns The trimmed LLM response, or a "Field: value" listing of the
     *   original values when the call fails. Never rejects.
     */
    private static async cleanWithLLM(prompt: string, originalBusiness: Business): Promise<string> {
        let timer: ReturnType<typeof setTimeout> | undefined;
        try {
            const timeoutPromise = new Promise<never>((_, reject) => {
                timer = setTimeout(() => reject(new Error('LLM timeout')), LLM_TIMEOUT);
            });

            const llmPromise = OllamaService.chat([{
                role: 'user',
                content: prompt
            }]);

            const response: unknown = await Promise.race([llmPromise, timeoutPromise]);
            if (typeof response !== 'string') {
                // Route unexpected payloads through the same fallback as a failure.
                throw new Error('LLM returned a non-string response');
            }
            return response.trim();
        } catch (error) {
            console.error('LLM cleanup error:', error);
            return this.formatFallback(originalBusiness);
        } finally {
            // Without this the timer stays pending for the full timeout even
            // after the LLM has already answered.
            if (timer !== undefined) {
                clearTimeout(timer);
            }
        }
    }

    /**
     * Serializes the record's current values in the "Field: value" format that
     * parseResponse understands. Missing fields are omitted so they are not
     * turned into the literal text "undefined", and whitespace is collapsed so
     * each value stays on one line.
     */
    private static formatFallback(business: Business): string {
        const fields: ReadonlyArray<readonly [string, unknown]> = [
            ['Address', business.address],
            ['Phone', business.phone],
            ['Email', business.email],
            ['Description', business.description]
        ];

        const lines: string[] = [];
        for (const [label, value] of fields) {
            if (value != null) {
                lines.push(`${label}: ${String(value).replace(/\s+/g, ' ').trim()}`);
            }
        }
        return lines.join('\n');
    }

    /**
     * Calculates a confidence score (0-1) for the cleaned business data.
     * Score is based on:
     * - Valid email format (0.25)
     * - Valid phone format (0.25)
     * - Valid address format (0.25)
     * - Description quality (0.25): length (0.1), mentions the business type
     *   (0.1), free of HTML/markdown punctuation (0.05)
     *
     * Points are accumulated as integers and divided once at the end so the
     * comparison against MIN_CONFIDENCE_SCORE does not depend on floating-point
     * summation order.
     */
    private static calculateConfidenceScore(business: Business): number {
        let points = 0;

        if (business.email && VALID_EMAIL_REGEX.test(business.email)) {
            points += 25;
        }

        if (business.phone && VALID_PHONE_REGEX.test(business.phone)) {
            points += 25;
        }

        if (business.address && VALID_ADDRESS_REGEX.test(business.address)) {
            points += 25;
        }

        const description = business.description;
        if (description) {
            // Length check
            if (description.length > 30 && description.length < 200) {
                points += 10;
            }

            // Relevance check: the description mentions the detected type
            const businessType = this.getBusinessType(business.name);
            if (description.toLowerCase().includes(businessType)) {
                points += 10;
            }

            // No HTML/markdown punctuation
            if (!/[<>[\]()]/.test(description)) {
                points += 5;
            }
        }

        return points / 100;
    }

    /**
     * Determines the type of business based on name keywords.
     * Used for validating and generating descriptions.
     *
     * @param name - Business name; a missing name is treated as empty.
     * @returns The first BUSINESS_TYPES keyword contained in the lower-cased
     *   name, or 'business' when none matches.
     */
    private static getBusinessType(name: string | null | undefined): string {
        const nameLower = (name ?? '').toLowerCase();
        return BUSINESS_TYPES.find(type => nameLower.includes(type)) || 'business';
    }

    /**
     * Business type hint embedded in the LLM prompt. Uses looser stems
     * ('plumb', 'electric') than getBusinessType.
     */
    private static getPromptBusinessType(name: string | null | undefined): string {
        const nameLower = (name ?? '').toLowerCase();
        if (nameLower.includes('restaurant')) {
            return 'restaurant';
        }
        if (nameLower.includes('plumb')) {
            return 'plumber';
        }
        if (nameLower.includes('electric')) {
            return 'electrician';
        }
        return 'business';
    }

    /**
     * Parses LLM response into structured business data.
     * Expects format: "field: value" for each line. Lines without a colon and
     * unknown field names are ignored; if a field appears more than once the
     * last occurrence wins.
     */
    private static parseResponse(response: string): Partial<Business> {
        const cleaned: Partial<Business> = {};

        for (const line of response.split('\n')) {
            const separator = line.indexOf(':');
            if (separator === -1) {
                continue;
            }

            const field = line.slice(0, separator).trim().toLowerCase();
            // The prompt shows inputs wrapped in quotes, so the model may echo
            // them back; drop one pair of matching surrounding quotes.
            const value = line
                .slice(separator + 1)
                .trim()
                .replace(/^(["'])(.*)\1$/, '$2')
                .trim();

            switch (field) {
                case 'address':
                    cleaned.address = value;
                    break;
                case 'phone':
                    cleaned.phone = value;
                    break;
                case 'email':
                    cleaned.email = value;
                    break;
                case 'description':
                    cleaned.description = value;
                    break;
            }
        }

        return cleaned;
    }

    /**
     * Applies validation rules and cleaning to each field.
     * - Email: lower-cased, markdown/mailto wrappers and leading phone digits
     *   removed; blanked when it does not match VALID_EMAIL_REGEX.
     * - Phone: reformatted to "(XXX) XXX-XXXX"; blanked unless it has 10 digits
     *   (a leading US country code "1" is ignored).
     * - Address: leading chatter removed, whitespace collapsed, trailing state
     *   name abbreviated to "CO".
     * - Description: prices and calls to action removed; replaced by a generic
     *   sentence when it does not mention the detected business type.
     *
     * @returns A new object; the input record is not mutated.
     */
    private static validateAndClean(business: Business): Business {
        const cleaned = { ...business };

        // Email validation and cleaning
        if (cleaned.email) {
            cleaned.email = cleaned.email
                .toLowerCase()
                .replace(/\[|\]|\(mailto:.*?\)/g, '')
                .replace(/^\d+-\d+/, '')
                .trim();

            // Placeholders such as "none", "n/a" or "union office" also fail
            // this format check, so no separate deny-list is needed.
            if (!VALID_EMAIL_REGEX.test(cleaned.email)) {
                cleaned.email = '';
            }
        }

        // Phone validation and cleaning
        if (cleaned.phone) {
            let digits = cleaned.phone.replace(/\D/g, '');
            if (digits.length === 11 && digits.startsWith('1')) {
                digits = digits.slice(1);
            }
            cleaned.phone = digits.length === 10
                ? `(${digits.slice(0, 3)}) ${digits.slice(3, 6)}-${digits.slice(6)}`
                : '';
        }

        // Address validation and cleaning
        if (cleaned.address) {
            cleaned.address = cleaned.address
                .replace(/^.*?(?=\d|[A-Z])/s, '')
                .replace(/^(Sure!.*?:|The business.*?:|.*?address.*?:)(?:\s*\\n)*\s*/si, '')
                .replace(/\s+/g, ' ')
                .trim();

            // Standardize the state abbreviation without touching street names
            cleaned.address = cleaned.address.replace(TRAILING_STATE_REGEX, 'CO');
        }

        // Description validation and cleaning
        if (cleaned.description) {
            cleaned.description = cleaned.description
                .replace(/\s+/g, ' ') // Flatten first so the call-to-action match reaches the end
                .replace(/\$\d+(\.\d{2})?/g, '') // Remove prices
                .replace(/\b(call|email|website|click|visit)\b.*$/i, '') // Remove calls to action
                .replace(/\s+/g, ' ')
                .trim();

            const businessType = this.getBusinessType(cleaned.name);
            if (businessType !== 'business' &&
                !cleaned.description.toLowerCase().includes(businessType)) {
                cleaned.description = `${businessType.charAt(0).toUpperCase() + businessType.slice(1)} services in the Denver area.`;
            }
        }

        return cleaned;
    }

    /**
     * Renders a value as a double-quoted prompt literal. Missing values become
     * an empty string instead of "undefined", and embedded quotes or newlines
     * are escaped so they cannot break out of the quoted field.
     */
    private static quoteForPrompt(value: unknown): string {
        return JSON.stringify(value == null ? '' : String(value));
    }

    /**
     * Builds the few-shot prompt asking the LLM to clean the record's address,
     * phone, email and description.
     */
    private static buildPrompt(business: Business): string {
        return `
            Clean and format the following business information. For each field, follow the format shown in the examples.
            The business type appears to be: ${this.getPromptBusinessType(business.name)}

            Return each field on a new line with the field name followed by a colon.
            Only return valid data - if something looks wrong or invalid, return an empty string.

            Examples for address:
            Input: "Sure! Here is the business address in Denver, CO:\\n\\n14100 W 7th Ave, Golden CO 80401"
            Output: 14100 W 7th Ave, Golden, CO 80401

            Examples for phone:
            Input: "7203796281"
            Output: (720) 379-6281
            Input: "N/A" or "none"
            Output:

            Examples for email:
            Input: "379-6281info@brutalpoodledenver.com"
            Output: info@brutalpoodledenver.com
            Input: "top-seo-img@2x.jpg" or "Union Office" or "[email]" or "None"
            Output:

            Examples for description:
            Input: "The Brutal Noodle $14.00 Beef bone broth, smoked brisket, rice noodles, all the fixins.
            (GF) Vegan available with tofu & veggie broth $11"
            Output: Asian fusion restaurant serving bone broth noodles with brisket and vegan options.
            Input: "Our Denver-based expert plumbers can repair or install any fixture. Commercial services: We're ready to keep your plumbing system operating safely."
            Output: Professional plumbing services for residential and commercial properties in Denver.

            Business name for context: ${this.quoteForPrompt(business.name)}
            Website for context: ${this.quoteForPrompt(business.website)}

            Now clean these fields:
            Address: ${this.quoteForPrompt(business.address)}
            Phone: ${this.quoteForPrompt(business.phone)}
            Email: ${this.quoteForPrompt(business.email)}
            Description: ${this.quoteForPrompt(business.description)}
        `;
    }

    /**
     * Cleans a single business record.
     *
     * Returns the cached result when one exists; otherwise asks the LLM,
     * merges the parsed fields over the original record, validates them, and
     * caches the result when its confidence score reaches MIN_CONFIDENCE_SCORE.
     * Cache errors are logged and otherwise ignored so they never discard a
     * record that could still be cleaned.
     *
     * @param business - Record to clean; not mutated.
     * @returns The cleaned record.
     */
    static async cleanBusinessRecord(business: Business): Promise<Business> {
        const cacheKey = `clean:${business.id}`;

        // Check cache first
        try {
            const cached = await db.getFromCache(cacheKey);
            if (cached) {
                console.log('Using cached clean data for:', business.name);
                return cached;
            }
        } catch (error) {
            console.warn('Cache read failed for:', cacheKey, error);
        }

        const response = await this.cleanWithLLM(this.buildPrompt(business), business);
        const parsed = this.parseResponse(response);
        const cleaned = this.validateAndClean({ ...business, ...parsed });

        // Only cache if confidence score is high enough
        const confidence = this.calculateConfidenceScore(cleaned);
        if (confidence >= MIN_CONFIDENCE_SCORE) {
            try {
                await db.saveToCache(cacheKey, cleaned, CACHE_TTL_MS);
            } catch (error) {
                console.warn('Cache write failed for:', cacheKey, error);
            }
        }

        return cleaned;
    }

    /**
     * Cleans many records, BATCH_SIZE at a time. Records inside a batch are
     * processed concurrently; batches run one after another. Output order
     * matches input order.
     *
     * @param businesses - Records to clean.
     * @returns Cleaned records in the same order as the input.
     */
    static async cleanBusinessRecords(businesses: Business[]): Promise<Business[]> {
        const cleanedBusinesses: Business[] = [];

        for (let start = 0; start < businesses.length; start += BATCH_SIZE) {
            const batch = businesses.slice(start, start + BATCH_SIZE);
            const cleanedBatch = await Promise.all(
                batch.map(business => this.cleanBusinessRecord(business))
            );
            cleanedBusinesses.push(...cleanedBatch);
        }

        return cleanedBusinesses;
    }
}