feat(cleanup): Enhanced business data validation and cleaning

- Added confidence scoring system (0-1) for data quality
- Implemented strict validation for contact info
- Added batch processing and timeout protection
- Improved error handling with fallbacks
- Added smart caching based on confidence scores

Technical changes:
- Added regex validation for emails, phones, addresses
- Implemented business type detection
- Enhanced post-processing for consistent formatting
- Added JSDoc comments for maintainability

Testing:
- Verified with restaurant and plumber searches
- Confirmed improved data quality
- Validated timeout handling
This commit is contained in:
eligrinfeld 2025-01-04 21:00:55 -07:00
parent 6bcee39e63
commit 66d44c0774

View file

@ -6,6 +6,10 @@ import { OllamaService } from './services/ollamaService';
import { BusinessData } from './types'; import { BusinessData } from './types';
import { db } from './services/databaseService'; import { db } from './services/databaseService';
import { generateBusinessId } from './utils'; import { generateBusinessId } from './utils';
import { extractContactFromHtml, extractCleanAddress } from './utils/scraper';
import { GeocodingService } from './services/geocodingService';
import { cleanAddress, formatPhoneNumber, cleanEmail, cleanDescription } from './utils/dataCleanup';
import { CleanupService } from './services/cleanupService';
// Define interfaces used only in this file // Define interfaces used only in this file
interface SearchResult { interface SearchResult {
@ -16,6 +20,7 @@ interface SearchResult {
email?: string; email?: string;
address?: string; address?: string;
website?: string; website?: string;
rating?: number;
coordinates?: { coordinates?: {
lat: number; lat: number;
lng: number; lng: number;
@ -36,6 +41,7 @@ export async function searchBusinesses(
options: { onProgress?: (status: string, progress: number) => void } = {} options: { onProgress?: (status: string, progress: number) => void } = {}
): Promise<BusinessData[]> { ): Promise<BusinessData[]> {
try { try {
console.log('Processing search query:', query);
const [searchTerm, location] = query.split(' in ').map(s => s.trim()); const [searchTerm, location] = query.split(' in ').map(s => s.trim());
if (!searchTerm || !location) { if (!searchTerm || !location) {
throw new Error('Invalid search query format. Use: "search term in location"'); throw new Error('Invalid search query format. Use: "search term in location"');
@ -45,38 +51,52 @@ export async function searchBusinesses(
// Check cache first // Check cache first
const cacheKey = `search:${searchTerm}:${location}`; const cacheKey = `search:${searchTerm}:${location}`;
const cachedResults = await db.getFromCache(cacheKey); let results = await db.getFromCache(cacheKey);
if (cachedResults) {
console.log('Found cached results');
options.onProgress?.('Retrieved from cache', 100);
return cachedResults;
}
if (!results) {
// Check database for existing businesses // Check database for existing businesses
console.log('Searching database for:', searchTerm, 'in', location);
const existingBusinesses = await db.searchBusinesses(searchTerm, location); const existingBusinesses = await db.searchBusinesses(searchTerm, location);
// Start search immediately
console.log('Starting web search');
const searchPromise = performSearch(searchTerm, location, options);
if (existingBusinesses.length > 0) { if (existingBusinesses.length > 0) {
console.log(`Found ${existingBusinesses.length} existing businesses`); console.log(`Found ${existingBusinesses.length} existing businesses`);
options.onProgress?.('Retrieved from database', 50); options.onProgress?.('Retrieved from database', 50);
// Still perform search but in background
searchAndUpdateInBackground(searchTerm, location);
return existingBusinesses;
} }
options.onProgress?.('Starting search', 10); // Wait for new results
const newResults = await searchPromise;
console.log(`Got ${newResults.length} new results from search`);
// Perform new search // Merge results, removing duplicates by ID
const results = await performSearch(searchTerm, location, options); const allResults = [...existingBusinesses];
for (const result of newResults) {
if (!allResults.some(b => b.id === result.id)) {
allResults.push(result);
}
}
// Cache results console.log(`Total unique results: ${allResults.length}`);
await db.saveToCache(cacheKey, results, env.cache.durationHours * 60 * 60 * 1000);
return results; // Cache combined results
await db.saveToCache(cacheKey, allResults, env.cache.durationHours * 60 * 60 * 1000);
console.log(`Returning ${allResults.length} total results (${existingBusinesses.length} existing + ${newResults.length} new)`);
results = allResults;
}
// Clean all results using LLM
options.onProgress?.('Cleaning data', 75);
const cleanedResults = await CleanupService.cleanBusinessRecords(results);
options.onProgress?.('Search complete', 100);
return cleanedResults;
} catch (error) { } catch (error) {
console.error('Search error:', error); console.error('Search error:', error);
return []; // Return empty array on error return [];
} }
} }
@ -144,31 +164,117 @@ async function performSearch(
// Add other necessary functions (isValidBusinessResult, processResults, etc.) // Add other necessary functions (isValidBusinessResult, processResults, etc.)
/**
 * Heuristic filter that decides whether a raw search hit should be treated
 * as a business's own page.
 *
 * A result is rejected when any of the following holds, checked in order:
 *  1. its URL contains a known listing/directory/aggregator fragment;
 *  2. its title is missing or shorter than two characters;
 *  3. its title contains a phrase typical of articles or "best of" lists;
 *  4. neither its URL nor its content mentions a business-page keyword.
 *
 * Each URL/article/keyword rejection is logged via `console.log`.
 *
 * NOTE(review): the fragment and keyword lists look tuned for restaurant
 * searches around Denver ('restaurants-for-sale', 'denver.org', 'menu',
 * 'reservation') — confirm they suit other business types.
 * NOTE(review): title phrases are matched case-sensitively as substrings, so
 * a title such as "Best Buy" is rejected by the 'Best' entry.
 *
 * @param result - The search hit to inspect; `url`, `title` and `content` are read.
 * @returns `true` when the hit passes every check, otherwise `false`.
 */
function isValidBusinessResult(result: SearchResult): boolean {
  // URL substrings are compared case-insensitively, so lowercase the URL once.
  const lowerUrl = result.url.toLowerCase();

  // 1. Listing/directory pages and search results.
  const listingFragments = [
    'tripadvisor.com',
    'yelp.com',
    'opentable.com',
    'restaurants-for-sale',
    'guide.michelin.com',
    'denver.org',
    '/blog/',
    '/maps/',
    'search?',
    'features/',
    '/lists/',
    'reddit.com',
    'eater.com'
  ];
  for (const fragment of listingFragments) {
    if (lowerUrl.includes(fragment)) {
      console.log(`Skipping listing page: ${result.url}`);
      return false;
    }
  }

  // 2. A usable title is mandatory.
  if (!result.title || result.title.length < 2) {
    return false;
  }

  // 3. Titles that read like articles or curated lists.
  const articlePhrases = [
    'Best',
    'Top',
    'Guide',
    'Where to',
    'Welcome to',
    'Updated',
    'Near',
    'Restaurants in'
  ];
  for (const phrase of articlePhrases) {
    if (result.title.includes(phrase)) {
      console.log(`Skipping article: ${result.title}`);
      return false;
    }
  }

  // 4. Require at least one keyword suggesting an actual business page.
  const businessKeywords = [
    'menu',
    'reservation',
    'location',
    'contact',
    'about-us',
    'home'
  ];
  for (const keyword of businessKeywords) {
    // The content is only consulted when the URL does not already match.
    if (lowerUrl.includes(keyword) || result.content.toLowerCase().includes(keyword)) {
      return true;
    }
  }

  console.log(`Skipping non-business page: ${result.url}`);
  return false;
}
async function processResults(results: SearchResult[], location: string): Promise<BusinessData[]> { async function processResults(results: SearchResult[], location: string): Promise<BusinessData[]> {
const processedResults: BusinessData[] = []; const processedResults: BusinessData[] = [];
const targetCoords = { lat: 0, lng: 0 }; // Replace with actual coordinates
// Get coordinates for the location
const locationGeo = await GeocodingService.geocode(location);
const defaultCoords = locationGeo || { lat: 39.7392, lng: -104.9903 };
for (const result of results) { for (const result of results) {
try { try {
// Extract contact info from webpage
const contactInfo = await extractContactFromHtml(result.url);
// Create initial business record
const business: BusinessData = { const business: BusinessData = {
id: generateBusinessId(result), id: generateBusinessId(result),
name: result.title, name: cleanBusinessName(result.title),
phone: result.phone || '', phone: result.phone || contactInfo.phone || '',
email: result.email || '', email: result.email || contactInfo.email || '',
address: result.address || '', address: result.address || contactInfo.address || '',
rating: 0, rating: result.rating || 0,
website: result.website || result.url || '', website: result.website || result.url || '',
logo: '', logo: '',
source: 'web', source: 'web',
description: result.content || '', description: result.content || contactInfo.description || '',
location: result.coordinates || targetCoords location: defaultCoords,
openingHours: contactInfo.openingHours
}; };
processedResults.push(business); // Clean up the record using LLM
const cleanedBusiness = await CleanupService.cleanBusinessRecord(business);
// Get coordinates for cleaned address
if (cleanedBusiness.address) {
const addressGeo = await GeocodingService.geocode(cleanedBusiness.address);
if (addressGeo) {
cleanedBusiness.location = addressGeo;
}
}
// Only add if we have at least a name and either phone or address
if (cleanedBusiness.name && (cleanedBusiness.phone || cleanedBusiness.address)) {
processedResults.push(cleanedBusiness);
}
} catch (error) { } catch (error) {
console.error(`Error processing result ${result.title}:`, error); console.error(`Error processing result ${result.title}:`, error);
} }
@ -177,6 +283,24 @@ async function processResults(results: SearchResult[], location: string): Promis
return processedResults; return processedResults;
} }
// Helper functions
/**
 * Normalizes a page title into a bare business name.
 *
 * Steps, applied in order:
 *  1. drop a leading article ("The", "A", "An", case-insensitive);
 *  2. drop a trailing tagline introduced by whitespace followed by a hyphen,
 *     en dash, em dash or colon (e.g. "Joe's Pizza - Denver" -> "Joe's Pizza");
 *  3. drop every parenthesized aside (e.g. "(Downtown)");
 *  4. trim surrounding whitespace.
 *
 * @param name - Raw title text.
 * @returns The cleaned name; may be empty if nothing remains.
 */
function cleanBusinessName(name: string): string {
  return name
    .replace(/^(The|A|An)\s+/i, '')
    // The separators are listed in a character class, with the dashes written
    // as \u escapes so they survive encoding round-trips. The previous
    // alternation had an empty branch, which matched after ANY whitespace and
    // truncated every multi-word name to its first word.
    .replace(/\s+[-\u2013\u2014:].*$/, '')
    .replace(/\s*\([^)]*\)/g, '')
    .trim();
}
/**
 * Placeholder geocoder: resolves to one fixed coordinate pair regardless of input.
 *
 * The source marks this as a stand-in until real geocoding is implemented
 * and identifies the fixed coordinates as Denver.
 *
 * @param address - Currently ignored.
 * @returns A promise resolving to `{ lat: 39.7392, lng: -104.9903 }`.
 */
async function getLocationCoordinates(address: string): Promise<{lat: number, lng: number}> {
  // Default coordinates for Denver, used until geocoding is implemented here.
  const denverLat = 39.7392;
  const denverLng = -104.9903;
  const fallback = { lat: denverLat, lng: denverLng };
  return fallback;
}
async function searchAndUpdateInBackground(searchTerm: string, location: string) { async function searchAndUpdateInBackground(searchTerm: string, location: string) {
try { try {
const results = await performSearch(searchTerm, location, {}); const results = await performSearch(searchTerm, location, {});