feat(cleanup): Enhanced business data validation and cleaning
- Added confidence scoring system for data quality
- Implemented strict validation for emails, phones, and addresses
- Added batch processing to prevent LLM overload
- Improved error handling and fallback mechanisms
- Added caching based on confidence scores

Technical changes:
- Added regex validation for contact info
- Implemented scoring system (0-1 scale)
- Added timeout protection for LLM calls
- Enhanced post-processing for consistent formatting
- Added business type detection for context

Breaking changes: None

Dependencies: No new dependencies required
This commit is contained in:
parent
fde5b5e318
commit
6bcee39e63
1 changed files with 276 additions and 0 deletions
276
src/lib/services/cleanupService.ts
Normal file
276
src/lib/services/cleanupService.ts
Normal file
|
@ -0,0 +1,276 @@
|
||||||
|
import { OllamaService } from './ollamaService';
|
||||||
|
import { Business } from '../types';
|
||||||
|
import { db } from './databaseService';
|
||||||
|
|
||||||
|
// Constants for validation and scoring

/** Number of businesses cleaned concurrently, kept small to avoid overwhelming the LLM. */
const BATCH_SIZE = 3;

/** Timeout for a single LLM request, in milliseconds (30 seconds). */
const LLM_TIMEOUT = 30000;

/** Minimum confidence score (0-1) a cleaned record needs before it is cached. */
const MIN_CONFIDENCE_SCORE = 0.7;

/** Matches a plain `local@domain.tld` email address. */
const VALID_EMAIL_REGEX = /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/;

/** Matches a phone number in the normalized `(XXX) XXX-XXXX` format. */
const VALID_PHONE_REGEX = /^\(\d{3}\) \d{3}-\d{4}$/;

/**
 * Matches a US street address of the form
 * `<number> ... <street suffix>[.][,] <city>, <ST> <ZIP>[-<plus4>]`.
 *
 * The suffix may carry a trailing period ("St.") and the ZIP may be a ZIP+4
 * code; both are common in well-formed addresses and must not lower the
 * confidence score.
 */
const VALID_ADDRESS_REGEX = /^\d+.*(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|way|parkway|pkwy|place|pl)\.?,?\s+[a-z ]+,\s*[a-z]{2}\s+\d{5}(?:-\d{4})?$/i;
|
||||||
|
|
||||||
|
/**
 * Cleans scraped business records by asking an LLM to normalize the address,
 * phone, email and description, then re-validating the LLM output with
 * deterministic rules before optionally caching the result.
 */
export class CleanupService {
  /**
   * Attempts to clean business data using the LLM with timeout protection.
   * Falls back to the original field values if the LLM call fails or times out.
   *
   * @param prompt - Full prompt sent to the LLM as a single user message.
   * @param originalBusiness - Record whose raw fields are used as the fallback.
   * @returns The trimmed LLM response, or a "Field: value" listing of the
   *   original values when the call fails.
   */
  private static async cleanWithLLM(prompt: string, originalBusiness: Business): Promise<string> {
    let timer: ReturnType<typeof setTimeout> | undefined;

    try {
      const timeoutPromise = new Promise<never>((_, reject) => {
        timer = setTimeout(() => reject(new Error('LLM timeout')), LLM_TIMEOUT);
      });

      const llmPromise = OllamaService.chat([{
        role: 'user',
        content: prompt
      }]);

      const response = await Promise.race([llmPromise, timeoutPromise]);
      return (response as string).trim();
    } catch (error) {
      console.error('LLM cleanup error:', error);

      // On failure or timeout, return the original values in the same
      // "Field: value" layout parseResponse expects. Missing fields become
      // empty strings rather than the literal text "undefined".
      return [
        `Address: ${originalBusiness.address ?? ''}`,
        `Phone: ${originalBusiness.phone ?? ''}`,
        `Email: ${originalBusiness.email ?? ''}`,
        `Description: ${originalBusiness.description ?? ''}`
      ].join('\n');
    } finally {
      // Always cancel the timer; otherwise each call leaves a pending
      // LLM_TIMEOUT timer behind even when the LLM answered immediately.
      clearTimeout(timer);
    }
  }

  /**
   * Calculates a confidence score (0-1) for the cleaned business data.
   * Score is based on:
   * - Valid email format (0.25)
   * - Valid phone format (0.25)
   * - Valid address format (0.25)
   * - Description quality (0.25: length 0.1, relevance 0.1, no markup 0.05)
   *
   * Points are accumulated as integer hundredths so the result compares
   * exactly against thresholds such as 0.7, with no floating-point drift.
   *
   * @param business - Record that has already been validated and cleaned.
   * @returns Score between 0 and 1 inclusive.
   */
  private static calculateConfidenceScore(business: Business): number {
    let points = 0;

    if (business.email && VALID_EMAIL_REGEX.test(business.email)) {
      points += 25;
    }

    if (business.phone && VALID_PHONE_REGEX.test(business.phone)) {
      points += 25;
    }

    if (business.address && VALID_ADDRESS_REGEX.test(business.address)) {
      points += 25;
    }

    const description = business.description;
    if (description) {
      // Length check
      if (description.length > 30 && description.length < 200) {
        points += 10;
      }

      // Relevance check: the description mentions the detected business type
      const businessType = this.getBusinessType(business.name);
      if (description.toLowerCase().includes(businessType)) {
        points += 10;
      }

      // No HTML/markdown residue
      if (!/[<>[\]()]/.test(description)) {
        points += 5;
      }
    }

    return points / 100;
  }

  /**
   * Determines the type of business based on keywords found in its name.
   * Used for validating and generating descriptions.
   *
   * @param name - Business name; a missing name is treated as empty.
   * @returns The first matching keyword, or 'business' when none matches.
   */
  private static getBusinessType(name: string): string {
    const types = [
      'restaurant', 'plumber', 'electrician', 'cafe', 'bar',
      'salon', 'shop', 'store', 'service'
    ];

    const nameLower = (name ?? '').toLowerCase();
    return types.find(type => nameLower.includes(type)) || 'business';
  }

  /**
   * Parses the LLM response into structured business data.
   * Expects one "field: value" pair per line; unknown fields are ignored.
   * A single pair of matching quotes around a value is removed, because the
   * prompt presents the inputs in quotes and the model may echo them back.
   *
   * @param response - Raw text returned by the LLM (or the fallback listing).
   * @returns Only the fields that were present in the response.
   */
  private static parseResponse(response: string): Partial<Business> {
    const cleaned: Partial<Business> = {};

    for (const line of response.split('\n')) {
      const [field, ...rest] = line.split(':');
      // Re-join so values that contain ':' themselves (times, URLs) survive.
      const value = rest
        .join(':')
        .trim()
        .replace(/^(["'])(.*)\1$/, '$2')
        .trim();

      switch (field.toLowerCase().trim()) {
        case 'address':
          cleaned.address = value;
          break;
        case 'phone':
          cleaned.phone = value;
          break;
        case 'email':
          cleaned.email = value;
          break;
        case 'description':
          cleaned.description = value;
          break;
      }
    }

    return cleaned;
  }

  /**
   * Applies validation rules and cleaning to each field.
   * - Standardizes formats
   * - Removes invalid data
   * - Ensures consistent formatting
   *
   * @param business - Record to clean; it is not mutated.
   * @returns A shallow copy with email, phone, address and description cleaned.
   */
  private static validateAndClean(business: Business): Business {
    const cleaned = { ...business };

    // Email validation and cleaning
    if (cleaned.email) {
      cleaned.email = cleaned.email
        .toLowerCase()
        .replace(/\[|\]|\(mailto:.*?\)/g, '') // markdown link residue
        .replace(/^\d+-\d+/, '') // phone digits glued to the front
        .trim();

      const isPlaceholder = ['none', 'n/a', 'union office', ''].includes(cleaned.email);
      // Asset names such as "top-seo-img@2x.jpg" satisfy the email regex but
      // are image files, not mailboxes.
      const looksLikeAsset = /\.(?:png|jpe?g|gif|svg|webp|bmp|ico)$/.test(cleaned.email);

      if (!VALID_EMAIL_REGEX.test(cleaned.email) || isPlaceholder || looksLikeAsset) {
        cleaned.email = '';
      }
    }

    // Phone validation and cleaning
    if (cleaned.phone) {
      const digits = cleaned.phone.replace(/\D/g, '');
      // Accept an 11-digit number that carries the leading US country code 1.
      const national = digits.length === 11 && digits.startsWith('1') ? digits.slice(1) : digits;

      cleaned.phone = national.length === 10
        ? `(${national.slice(0, 3)}) ${national.slice(3, 6)}-${national.slice(6)}`
        : '';
    }

    // Address validation and cleaning
    if (cleaned.address) {
      cleaned.address = cleaned.address
        // Drop everything before the first digit or capital letter
        .replace(/^.*?(?=\d|[A-Z])/s, '')
        // Drop conversational lead-ins; "\\n" matches a literal backslash-n
        // sequence left in the text
        .replace(/^(Sure!.*?:|The business.*?:|.*?address.*?:)(?:\s*\\n)*\s*/si, '')
        .replace(/\s+/g, ' ')
        .trim();

      // Standardize the state name to "CO", but only in the state position
      // (end of the address, optionally followed by a ZIP code) so street and
      // city names such as "Colorado Blvd" or "Colorado Springs" are kept.
      cleaned.address = cleaned.address.replace(
        /\b(?:Colorado|Colo|Col)\b\.?(?=,?\s*(?:\d{5}(?:-\d{4})?)?\s*$)/i,
        'CO'
      );
    }

    // Description validation and cleaning
    if (cleaned.description) {
      cleaned.description = cleaned.description
        .replace(/\$\d+(\.\d{2})?/g, '') // Remove prices
        .replace(/\b(call|email|website|click|visit)\b.*$/i, '') // Remove calls to action
        .replace(/\s+/g, ' ')
        .trim();

      // If the description never mentions the detected business type,
      // replace it with a generic one-liner for that type.
      const businessType = this.getBusinessType(cleaned.name);
      if (businessType !== 'business' &&
          !cleaned.description.toLowerCase().includes(businessType)) {
        cleaned.description = `${businessType.charAt(0).toUpperCase() + businessType.slice(1)} services in the Denver area.`;
      }
    }

    return cleaned;
  }

  /**
   * Cleans a single business record.
   *
   * Returns the cached result when one exists under `clean:<id>`. Otherwise
   * it prompts the LLM, parses and validates the answer, and caches the result
   * for 24 hours when its confidence score reaches MIN_CONFIDENCE_SCORE.
   *
   * @param business - Raw business record.
   * @returns The cleaned record (or the cached one).
   */
  static async cleanBusinessRecord(business: Business): Promise<Business> {
    // Check cache first
    const cacheKey = `clean:${business.id}`;
    const cached = await db.getFromCache(cacheKey);
    if (cached) {
      console.log('Using cached clean data for:', business.name);
      return cached;
    }

    // Business-type hint for the prompt: name stems first, then the same
    // keyword detection used by the validator so both agree.
    const nameLower = (business.name ?? '').toLowerCase();
    let businessTypeHint: string;
    if (nameLower.includes('restaurant')) {
      businessTypeHint = 'restaurant';
    } else if (nameLower.includes('plumb')) {
      businessTypeHint = 'plumber';
    } else if (nameLower.includes('electric')) {
      businessTypeHint = 'electrician';
    } else {
      businessTypeHint = this.getBusinessType(business.name);
    }

    // Missing fields are sent as empty strings, not the text "undefined".
    const combinedPrompt = `
      Clean and format the following business information. For each field, follow the format shown in the examples.
      The business type appears to be: ${businessTypeHint}

      Return each field on a new line with the field name followed by a colon.
      Only return valid data - if something looks wrong or invalid, return an empty string.

      Examples for address:
      Input: "Sure! Here is the business address in Denver, CO:\\n\\n14100 W 7th Ave, Golden CO 80401"
      Output: 14100 W 7th Ave, Golden, CO 80401

      Examples for phone:
      Input: "7203796281"
      Output: (720) 379-6281
      Input: "N/A" or "none"
      Output:

      Examples for email:
      Input: "379-6281info@brutalpoodledenver.com"
      Output: info@brutalpoodledenver.com
      Input: "top-seo-img@2x.jpg" or "Union Office" or "[email]" or "None"
      Output:

      Examples for description:
      Input: "The Brutal Noodle $14.00 Beef bone broth, smoked brisket, rice noodles, all the fixins. (GF) Vegan available with tofu & veggie broth $11"
      Output: Asian fusion restaurant serving bone broth noodles with brisket and vegan options.
      Input: "Our Denver-based expert plumbers can repair or install any fixture. Commercial services: We're ready to keep your plumbing system operating safely."
      Output: Professional plumbing services for residential and commercial properties in Denver.

      Business name for context: "${business.name ?? ''}"
      Website for context: "${business.website ?? ''}"

      Now clean these fields:
      Address: "${business.address ?? ''}"
      Phone: "${business.phone ?? ''}"
      Email: "${business.email ?? ''}"
      Description: "${business.description ?? ''}"
    `;

    const response = await this.cleanWithLLM(combinedPrompt, business);
    const parsed = this.parseResponse(response);
    const cleaned = this.validateAndClean({ ...business, ...parsed });

    // Only cache if confidence score is high enough
    const confidence = this.calculateConfidenceScore(cleaned);
    if (confidence >= MIN_CONFIDENCE_SCORE) {
      await db.saveToCache(cacheKey, cleaned, 24 * 60 * 60 * 1000);
    }

    return cleaned;
  }

  /**
   * Cleans many business records in batches of BATCH_SIZE.
   *
   * Batches run one after another; records inside a batch are cleaned
   * concurrently. Output order matches input order. If cleaning any record
   * rejects, the whole call rejects.
   *
   * @param businesses - Records to clean.
   * @returns Cleaned records, in the same order as the input.
   */
  static async cleanBusinessRecords(businesses: Business[]): Promise<Business[]> {
    const cleanedBusinesses: Business[] = [];

    // Process in batches
    for (let i = 0; i < businesses.length; i += BATCH_SIZE) {
      const batch = businesses.slice(i, i + BATCH_SIZE);
      const cleanedBatch = await Promise.all(
        batch.map(business => this.cleanBusinessRecord(business))
      );
      cleanedBusinesses.push(...cleanedBatch);
    }

    return cleanedBusinesses;
  }
}
|
Loading…
Add table
Reference in a new issue