feat(cleanup): Enhanced business data validation and cleaning

- Added confidence scoring system for data quality
- Implemented strict validation for emails, phones, and addresses
- Added batch processing to prevent LLM overload
- Improved error handling and fallback mechanisms
- Added caching based on confidence scores

Technical changes:
- Added regex validation for contact info
- Implemented scoring system (0-1 scale)
- Added timeout protection for LLM calls
- Enhanced post-processing for consistent formatting
- Added business type detection for context

Breaking changes: None
Dependencies: No new dependencies required
This commit is contained in:
eligrinfeld 2025-01-04 20:59:00 -07:00
parent fde5b5e318
commit 6bcee39e63

View file

@ -0,0 +1,276 @@
import { OllamaService } from './ollamaService';
import { Business } from '../types';
import { db } from './databaseService';
// Constants for validation and scoring
const BATCH_SIZE = 3; // Process businesses in small batches to avoid overwhelming LLM
const LLM_TIMEOUT = 30000; // 30 second timeout for LLM requests
const MIN_CONFIDENCE_SCORE = 0.7; // Minimum score required to cache results
const VALID_EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;
const VALID_PHONE_REGEX = /^\(\d{3}\) \d{3}-\d{4}$/;
const VALID_ADDRESS_REGEX = /^\d+.*(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|way|parkway|pkwy|place|pl),?\s+[a-z ]+,\s*[a-z]{2}\s+\d{5}$/i;
export class CleanupService {
/**
* Attempts to clean business data using LLM with timeout protection.
* Falls back to original data if LLM fails or times out.
*/
private static async cleanWithLLM(prompt: string, originalBusiness: Business): Promise<string> {
try {
const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => reject(new Error('LLM timeout')), LLM_TIMEOUT);
});
const llmPromise = OllamaService.chat([{
role: 'user',
content: prompt
}]);
const response = await Promise.race([llmPromise, timeoutPromise]);
return (response as string).trim();
} catch (error) {
console.error('LLM cleanup error:', error);
// On timeout, return the original values
return `
Address: ${originalBusiness.address}
Phone: ${originalBusiness.phone}
Email: ${originalBusiness.email}
Description: ${originalBusiness.description}
`;
}
}
/**
* Calculates a confidence score (0-1) for the cleaned business data.
* Score is based on:
* - Valid email format (0.25)
* - Valid phone format (0.25)
* - Valid address format (0.25)
* - Description quality (0.25)
*/
private static calculateConfidenceScore(business: Business): number {
let score = 0;
// Valid email adds 0.25
if (business.email && VALID_EMAIL_REGEX.test(business.email)) {
score += 0.25;
}
// Valid phone adds 0.25
if (business.phone && VALID_PHONE_REGEX.test(business.phone)) {
score += 0.25;
}
// Valid address adds 0.25
if (business.address && VALID_ADDRESS_REGEX.test(business.address)) {
score += 0.25;
}
// Description quality checks (0.25 max)
if (business.description) {
// Length check (0.1)
if (business.description.length > 30 && business.description.length < 200) {
score += 0.1;
}
// Relevance check (0.1)
const businessType = this.getBusinessType(business.name);
if (business.description.toLowerCase().includes(businessType)) {
score += 0.1;
}
// No HTML/markdown (0.05)
if (!/[<>[\]()]/.test(business.description)) {
score += 0.05;
}
}
return score;
}
/**
* Determines the type of business based on name keywords.
* Used for validating and generating descriptions.
*/
private static getBusinessType(name: string): string {
const types = [
'restaurant', 'plumber', 'electrician', 'cafe', 'bar',
'salon', 'shop', 'store', 'service'
];
const nameLower = name.toLowerCase();
return types.find(type => nameLower.includes(type)) || 'business';
}
/**
* Parses LLM response into structured business data.
* Expects format: "field: value" for each line.
*/
private static parseResponse(response: string): Partial<Business> {
const cleaned: Partial<Business> = {};
const lines = response.split('\n');
for (const line of lines) {
const [field, ...values] = line.split(':');
const value = values.join(':').trim();
switch (field.toLowerCase().trim()) {
case 'address':
cleaned.address = value;
break;
case 'phone':
cleaned.phone = value;
break;
case 'email':
cleaned.email = value;
break;
case 'description':
cleaned.description = value;
break;
}
}
return cleaned;
}
/**
* Applies validation rules and cleaning to each field.
* - Standardizes formats
* - Removes invalid data
* - Ensures consistent formatting
*/
private static validateAndClean(business: Business): Business {
const cleaned = { ...business };
// Email validation and cleaning
if (cleaned.email) {
cleaned.email = cleaned.email
.toLowerCase()
.replace(/\[|\]|\(mailto:.*?\)/g, '')
.replace(/^\d+-\d+/, '')
.trim();
if (!VALID_EMAIL_REGEX.test(cleaned.email) ||
['none', 'n/a', 'union office', ''].includes(cleaned.email.toLowerCase())) {
cleaned.email = '';
}
}
// Phone validation and cleaning
if (cleaned.phone) {
const digits = cleaned.phone.replace(/\D/g, '');
if (digits.length === 10) {
cleaned.phone = `(${digits.slice(0,3)}) ${digits.slice(3,6)}-${digits.slice(6)}`;
} else {
cleaned.phone = '';
}
}
// Address validation and cleaning
if (cleaned.address) {
cleaned.address = cleaned.address
.replace(/^.*?(?=\d|[A-Z])/s, '')
.replace(/^(Sure!.*?:|The business.*?:|.*?address.*?:)(?:\s*\\n)*\s*/si, '')
.replace(/\s+/g, ' ')
.trim();
// Standardize state abbreviations
cleaned.address = cleaned.address.replace(/\b(Colorado|Colo|Col)\b/gi, 'CO');
}
// Description validation and cleaning
if (cleaned.description) {
cleaned.description = cleaned.description
.replace(/\$\d+(\.\d{2})?/g, '') // Remove prices
.replace(/\b(call|email|website|click|visit)\b.*$/i, '') // Remove calls to action
.replace(/\s+/g, ' ')
.trim();
const businessType = this.getBusinessType(cleaned.name);
if (businessType !== 'business' &&
!cleaned.description.toLowerCase().includes(businessType)) {
cleaned.description = `${businessType.charAt(0).toUpperCase() + businessType.slice(1)} services in the Denver area.`;
}
}
return cleaned;
}
static async cleanBusinessRecord(business: Business): Promise<Business> {
// Check cache first
const cacheKey = `clean:${business.id}`;
const cached = await db.getFromCache(cacheKey);
if (cached) {
console.log('Using cached clean data for:', business.name);
return cached;
}
const combinedPrompt = `
Clean and format the following business information. For each field, follow the format shown in the examples.
The business type appears to be: ${business.name.toLowerCase().includes('restaurant') ? 'restaurant' :
business.name.toLowerCase().includes('plumb') ? 'plumber' :
business.name.toLowerCase().includes('electric') ? 'electrician' : 'business'}
Return each field on a new line with the field name followed by a colon.
Only return valid data - if something looks wrong or invalid, return an empty string.
Examples for address:
Input: "Sure! Here is the business address in Denver, CO:\\n\\n14100 W 7th Ave, Golden CO 80401"
Output: 14100 W 7th Ave, Golden, CO 80401
Examples for phone:
Input: "7203796281"
Output: (720) 379-6281
Input: "N/A" or "none"
Output:
Examples for email:
Input: "379-6281info@brutalpoodledenver.com"
Output: info@brutalpoodledenver.com
Input: "top-seo-img@2x.jpg" or "Union Office" or "[email]" or "None"
Output:
Examples for description:
Input: "The Brutal Noodle $14.00 Beef bone broth, smoked brisket, rice noodles, all the fixins. (GF) Vegan available with tofu & veggie broth $11"
Output: Asian fusion restaurant serving bone broth noodles with brisket and vegan options.
Input: "Our Denver-based expert plumbers can repair or install any fixture. Commercial services: We're ready to keep your plumbing system operating safely."
Output: Professional plumbing services for residential and commercial properties in Denver.
Business name for context: "${business.name}"
Website for context: "${business.website}"
Now clean these fields:
Address: "${business.address}"
Phone: "${business.phone}"
Email: "${business.email}"
Description: "${business.description}"
`;
const response = await this.cleanWithLLM(combinedPrompt, business);
const parsed = this.parseResponse(response);
const cleaned = this.validateAndClean({ ...business, ...parsed });
// Only cache if confidence score is high enough
const confidence = this.calculateConfidenceScore(cleaned);
if (confidence >= MIN_CONFIDENCE_SCORE) {
await db.saveToCache(cacheKey, cleaned, 24 * 60 * 60 * 1000);
}
return cleaned;
}
static async cleanBusinessRecords(businesses: Business[]): Promise<Business[]> {
const cleanedBusinesses: Business[] = [];
// Process in batches
for (let i = 0; i < businesses.length; i += BATCH_SIZE) {
const batch = businesses.slice(i, i + BATCH_SIZE);
const cleanedBatch = await Promise.all(
batch.map(business => this.cleanBusinessRecord(business))
);
cleanedBusinesses.push(...cleanedBatch);
}
return cleanedBusinesses;
}
}