Add project files:
- Add database initialization scripts
- Add configuration files
- Add documentation
- Add public assets
- Add source code structure
- Update README
parent 372943801d
commit fde5b5e318
39 changed files with 10099 additions and 187 deletions
src/lib/services/businessCrawler.ts (new file, 111 lines)
@@ -0,0 +1,111 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Cache } from '../utils/cache';
import { RateLimiter } from '../utils/rateLimiter';

interface CrawlResult {
  mainContent: string;
  contactInfo: string;
  aboutInfo: string;
  structuredData: any;
}

export class BusinessCrawler {
  private cache: Cache<CrawlResult>;
  private rateLimiter: RateLimiter;

  constructor() {
    this.cache = new Cache<CrawlResult>(60); // 1 hour cache
    this.rateLimiter = new RateLimiter();
  }

  async crawlBusinessSite(url: string): Promise<CrawlResult> {
    // Check cache first
    const cached = this.cache.get(url);
    if (cached) return cached;

    await this.rateLimiter.waitForSlot();

    try {
      const mainPage = await this.fetchPage(url);
      const $ = cheerio.load(mainPage);

      // Get all important URLs
      const contactUrl = this.findContactPage($, url);
      const aboutUrl = this.findAboutPage($, url);

      // Crawl additional pages
      const [contactPage, aboutPage] = await Promise.all([
        contactUrl ? this.fetchPage(contactUrl) : '',
        aboutUrl ? this.fetchPage(aboutUrl) : ''
      ]);

      // Extract structured data
      const structuredData = this.extractStructuredData($);

      const result = {
        mainContent: $('body').text(),
        contactInfo: contactPage,
        aboutInfo: aboutPage,
        structuredData
      };

      this.cache.set(url, result);
      return result;
    } catch (error) {
      console.error(`Failed to crawl ${url}:`, error);
      return {
        mainContent: '',
        contactInfo: '',
        aboutInfo: '',
        structuredData: {}
      };
    }
  }

  private async fetchPage(url: string): Promise<string> {
    try {
      const response = await axios.get(url, {
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; BizSearch/1.0; +http://localhost:3000/about)',
        }
      });
      return response.data;
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error);
      return '';
    }
  }

  private findContactPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
    const contactLinks = $('a[href*="contact"], a:contains("Contact")');
    if (contactLinks.length > 0) {
      const href = contactLinks.first().attr('href');
      return href ? new URL(href, baseUrl).toString() : null;
    }
    return null;
  }

  private findAboutPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
    const aboutLinks = $('a[href*="about"], a:contains("About")');
    if (aboutLinks.length > 0) {
      const href = aboutLinks.first().attr('href');
      return href ? new URL(href, baseUrl).toString() : null;
    }
    return null;
  }

  private extractStructuredData($: cheerio.CheerioAPI): any {
    const structuredData: any[] = [];
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        const data = JSON.parse($(element).html() || '{}');
        structuredData.push(data);
      } catch (error) {
        console.error('Failed to parse structured data:', error);
      }
    });
    return structuredData;
  }
}
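Note: the crawler imports Cache and RateLimiter from src/lib/utils, which are not part of this diff. The following is a minimal sketch of what those utilities might look like, assuming the Cache constructor takes a TTL in minutes and the rate limiter enforces a fixed minimum interval between requests; every name and default here is an assumption, not the committed implementation.

// Hypothetical sketch of src/lib/utils/cache.ts and src/lib/utils/rateLimiter.ts,
// inferred only from how BusinessCrawler uses them.
export class Cache<T> {
  private store = new Map<string, { value: T; expiresAt: number }>();

  constructor(private ttlMinutes: number) {}

  get(key: string): T | undefined {
    const entry = this.store.get(key);
    if (!entry) return undefined;
    if (Date.now() > entry.expiresAt) {
      this.store.delete(key); // drop expired entries lazily
      return undefined;
    }
    return entry.value;
  }

  set(key: string, value: T): void {
    this.store.set(key, { value, expiresAt: Date.now() + this.ttlMinutes * 60_000 });
  }
}

export class RateLimiter {
  private last = 0;

  constructor(private minIntervalMs: number = 1000) {}

  // Resolves once enough time has passed since the previously granted slot.
  async waitForSlot(): Promise<void> {
    const wait = Math.max(0, this.last + this.minIntervalMs - Date.now());
    this.last = Date.now() + wait;
    await new Promise(resolve => setTimeout(resolve, wait));
  }
}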
src/lib/services/cacheService.ts (new file, 71 lines)
@@ -0,0 +1,71 @@
import { supabase } from '../supabase';
import { BusinessData } from '../searxng';

export class CacheService {
  static async getCachedResults(category: string, location: string): Promise<BusinessData[] | null> {
    try {
      const { data, error } = await supabase
        .from('search_cache')
        .select('results')
        .eq('category', category.toLowerCase())
        .eq('location', location.toLowerCase())
        .gt('expires_at', new Date().toISOString())
        .order('created_at', { ascending: false })
        .limit(1)
        .single();

      if (error) throw error;
      return data ? data.results : null;
    } catch (error) {
      console.error('Cache lookup failed:', error);
      return null;
    }
  }

  static async cacheResults(
    category: string,
    location: string,
    results: BusinessData[],
    expiresInDays: number = 7
  ): Promise<void> {
    try {
      const expiresAt = new Date();
      expiresAt.setDate(expiresAt.getDate() + expiresInDays);

      const { error } = await supabase
        .from('search_cache')
        .insert({
          query: `${category} in ${location}`,
          category: category.toLowerCase(),
          location: location.toLowerCase(),
          results,
          expires_at: expiresAt.toISOString()
        });

      if (error) throw error;
    } catch (error) {
      console.error('Failed to cache results:', error);
    }
  }

  static async updateCache(
    category: string,
    location: string,
    newResults: BusinessData[]
  ): Promise<void> {
    try {
      const { error } = await supabase
        .from('search_cache')
        .update({
          results: newResults,
          updated_at: new Date().toISOString()
        })
        .eq('category', category.toLowerCase())
        .eq('location', location.toLowerCase());

      if (error) throw error;
    } catch (error) {
      console.error('Failed to update cache:', error);
    }
  }
}
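Note: a rough usage sketch of CacheService around a search, to show the intended read-through pattern. The search function is injected here because the actual search entry point is not part of this diff; treat it as a placeholder.

// Hypothetical read-through caching flow; the search callback is an assumption.
import { CacheService } from './cacheService';
import { BusinessData } from '../searxng';

async function getBusinesses(
  category: string,
  location: string,
  search: (category: string, location: string) => Promise<BusinessData[]>
): Promise<BusinessData[]> {
  // Serve from the Supabase-backed cache when a non-expired entry exists
  const cached = await CacheService.getCachedResults(category, location);
  if (cached) return cached;

  // Otherwise run the real search and cache the results (7-day default TTL)
  const results = await search(category, location);
  await CacheService.cacheResults(category, location, results);
  return results;
}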
src/lib/services/dataValidation.ts (new file, 107 lines)
@@ -0,0 +1,107 @@
import { OllamaService } from './ollamaService';

interface ValidatedBusinessData {
  name: string;
  phone: string;
  email: string;
  address: string;
  description: string;
  hours?: string;
  isValid: boolean;
}

export class DataValidationService {
  private ollama: OllamaService;

  constructor() {
    this.ollama = new OllamaService();
  }

  async validateAndCleanData(rawText: string): Promise<ValidatedBusinessData> {
    try {
      const prompt = `
        You are a business data validation expert. Extract and validate business information from the following text.
        Return ONLY a JSON object with the following format, nothing else:
        {
          "name": "verified business name",
          "phone": "formatted phone number or N/A",
          "email": "verified email address or N/A",
          "address": "verified physical address or N/A",
          "description": "short business description",
          "hours": "business hours if available",
          "isValid": boolean
        }

        Rules:
        1. Phone numbers should be in (XXX) XXX-XXXX format
        2. Addresses should be properly formatted with street, city, state, zip
        3. Remove any irrelevant text from descriptions
        4. Set isValid to true only if name and at least one contact method is found
        5. Clean up any obvious formatting issues
        6. Validate email addresses for proper format

        Text to analyze:
        ${rawText}
      `;

      const response = await this.ollama.complete(prompt);

      try {
        // Find the JSON object in the response
        const jsonMatch = response.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
          throw new Error('No JSON found in response');
        }

        const result = JSON.parse(jsonMatch[0]);
        return this.validateResult(result);
      } catch (parseError) {
        console.error('Failed to parse Ollama response:', parseError);
        throw parseError;
      }
    } catch (error) {
      console.error('Data validation failed:', error);
      return {
        name: 'Unknown',
        phone: 'N/A',
        email: 'N/A',
        address: 'N/A',
        description: '',
        hours: '',
        isValid: false
      };
    }
  }

  private validateResult(result: any): ValidatedBusinessData {
    // Ensure all required fields are present
    const validated: ValidatedBusinessData = {
      name: this.cleanField(result.name) || 'Unknown',
      phone: this.formatPhone(result.phone) || 'N/A',
      email: this.cleanField(result.email) || 'N/A',
      address: this.cleanField(result.address) || 'N/A',
      description: this.cleanField(result.description) || '',
      hours: this.cleanField(result.hours),
      isValid: Boolean(result.isValid)
    };

    return validated;
  }

  private cleanField(value: any): string {
    if (!value || typeof value !== 'string') return '';
    return value.trim().replace(/\s+/g, ' ');
  }

  private formatPhone(phone: string): string {
    if (!phone || phone === 'N/A') return 'N/A';

    // Extract digits
    const digits = phone.replace(/\D/g, '');
    if (digits.length === 10) {
      return `(${digits.slice(0,3)}) ${digits.slice(3,6)}-${digits.slice(6)}`;
    }

    return phone;
  }
}
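Note: a brief sketch of how the crawler and validator in this commit might be chained, assuming the calling code lives elsewhere in the app; only the two classes shown above are real, the wrapper function is illustrative.

// Hypothetical pipeline step: crawl a site, then validate the extracted text.
import { BusinessCrawler } from './businessCrawler';
import { DataValidationService } from './dataValidation';

async function extractBusinessProfile(url: string) {
  const crawler = new BusinessCrawler();
  const validator = new DataValidationService();

  // Crawl the main, contact, and about pages, then hand the combined text to the LLM validator
  const crawled = await crawler.crawlBusinessSite(url);
  const rawText = [crawled.mainContent, crawled.contactInfo, crawled.aboutInfo].join('\n');

  const profile = await validator.validateAndCleanData(rawText);
  return profile.isValid ? profile : null;
}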
src/lib/services/healthCheck.ts (new file, 53 lines)
@@ -0,0 +1,53 @@
import axios from 'axios';
import { env } from '../../config/env';
import { supabase } from '../supabase';

export class HealthCheckService {
  static async checkOllama(): Promise<boolean> {
    try {
      const response = await axios.get(`${env.ollama.url}/api/tags`);
      return response.status === 200;
    } catch (error) {
      console.error('Ollama health check failed:', error);
      return false;
    }
  }

  static async checkSearxNG(): Promise<boolean> {
    try {
      const response = await axios.get(`${env.searxng.currentUrl}/config`);
      return response.status === 200;
    } catch (error) {
      try {
        const response = await axios.get(`${env.searxng.instances[0]}/config`);
        return response.status === 200;
      } catch (fallbackError) {
        console.error('SearxNG health check failed:', error);
        return false;
      }
    }
  }

  static async checkSupabase(): Promise<boolean> {
    try {
      console.log('Checking Supabase connection...');
      console.log('URL:', env.supabase.url);

      // Just check if we can connect and query, don't care about results
      const { error } = await supabase
        .from('businesses')
        .select('count', { count: 'planned', head: true });

      if (error) {
        console.error('Supabase query error:', error);
        return false;
      }

      console.log('Supabase connection successful');
      return true;
    } catch (error) {
      console.error('Supabase connection failed:', error);
      return false;
    }
  }
}
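Note: the commit does not show how the three checks are surfaced; a small aggregator like the one below is one plausible way to combine them for a status endpoint. The function name and return shape are assumptions.

// Hypothetical aggregator over the three health checks above.
import { HealthCheckService } from './healthCheck';

export async function getSystemHealth() {
  const [ollama, searxng, supabase] = await Promise.all([
    HealthCheckService.checkOllama(),
    HealthCheckService.checkSearxNG(),
    HealthCheckService.checkSupabase()
  ]);

  return {
    healthy: ollama && searxng && supabase,
    services: { ollama, searxng, supabase }
  };
}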
src/lib/services/ollamaService.ts (new file, 36 lines)
@@ -0,0 +1,36 @@
import axios from 'axios';
import { env } from '../../config/env';

interface OllamaResponse {
  response: string;
  context?: number[];
}

export class OllamaService {
  private url: string;
  private model: string;

  constructor() {
    this.url = env.ollama.url;
    this.model = env.ollama.model;
  }

  async complete(prompt: string): Promise<string> {
    try {
      const response = await axios.post(`${this.url}/api/generate`, {
        model: this.model,
        prompt: prompt,
        stream: false,
        options: {
          temperature: 0.7,
          top_p: 0.9
        }
      });

      return response.data.response;
    } catch (error) {
      console.error('Ollama completion failed:', error);
      throw error;
    }
  }
}
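Note: several services above read from src/config/env.ts, which is not included in this excerpt. Based only on the fields they access (env.ollama.url, env.ollama.model, env.searxng.currentUrl, env.searxng.instances, env.supabase.url, env.supabase.anonKey), its shape is presumably along the following lines; the variable names and defaults are purely illustrative.

// Hypothetical reconstruction of src/config/env.ts; not the committed file.
export const env = {
  ollama: {
    url: process.env.OLLAMA_URL || 'http://localhost:11434',
    model: process.env.OLLAMA_MODEL || 'llama2'
  },
  searxng: {
    currentUrl: process.env.SEARXNG_URL || 'http://localhost:4000',
    instances: (process.env.SEARXNG_INSTANCES || 'http://localhost:4000').split(',')
  },
  supabase: {
    url: process.env.SUPABASE_URL || '',
    anonKey: process.env.SUPABASE_ANON_KEY || ''
  }
};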
src/lib/services/supabaseService.ts (new file, 93 lines)
@@ -0,0 +1,93 @@
import { createClient } from '@supabase/supabase-js';
import { env } from '../../config/env';
import { BusinessData } from '../searxng';

export class SupabaseService {
  private supabase;

  constructor() {
    this.supabase = createClient(env.supabase.url, env.supabase.anonKey);
  }

  async upsertBusinesses(businesses: BusinessData[]): Promise<void> {
    try {
      console.log('Upserting businesses to Supabase:', businesses.length);

      for (const business of businesses) {
        try {
          // Create a unique identifier based on multiple properties
          const identifier = [
            business.name.toLowerCase(),
            business.phone?.replace(/\D/g, ''),
            business.address?.toLowerCase(),
            business.website?.toLowerCase()
          ]
            .filter(Boolean) // Remove empty values
            .join('_') // Join with underscore
            .replace(/[^a-z0-9]/g, '_'); // Replace non-alphanumeric chars

          // Log the data being inserted
          console.log('Upserting business:', {
            id: identifier,
            name: business.name,
            phone: business.phone,
            email: business.email,
            address: business.address,
            rating: business.rating,
            website: business.website,
            location: business.location
          });

          // Check if business exists
          const { data: existing, error: selectError } = await this.supabase
            .from('businesses')
            .select('rating, search_count')
            .eq('id', identifier)
            .single();

          if (selectError && selectError.code !== 'PGRST116') {
            console.error('Error checking existing business:', selectError);
          }

          // Prepare upsert data
          const upsertData = {
            id: identifier,
            name: business.name,
            phone: business.phone || null,
            email: business.email || null,
            address: business.address || null,
            rating: existing ? Math.max(business.rating, existing.rating) : business.rating,
            website: business.website || null,
            logo: business.logo || null,
            source: business.source || null,
            description: business.description || null,
            latitude: business.location?.lat || null,
            longitude: business.location?.lng || null,
            last_updated: new Date().toISOString(),
            search_count: existing ? existing.search_count + 1 : 1
          };

          console.log('Upserting with data:', upsertData);

          const { error: upsertError } = await this.supabase
            .from('businesses')
            .upsert(upsertData, {
              onConflict: 'id'
            });

          if (upsertError) {
            console.error('Error upserting business:', upsertError);
            console.error('Failed business data:', upsertData);
          } else {
            console.log(`Successfully upserted business: ${business.name}`);
          }
        } catch (businessError) {
          console.error('Error processing business:', business.name, businessError);
        }
      }
    } catch (error) {
      console.error('Error saving businesses to Supabase:', error);
      throw error;
    }
  }
}
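Note: a short end-of-pipeline sketch showing how SupabaseService and CacheService might be used together once results are in hand; the wrapper function is illustrative and not part of this commit.

// Hypothetical persistence step: upsert business rows, then refresh the query cache.
import { SupabaseService } from './supabaseService';
import { CacheService } from './cacheService';
import { BusinessData } from '../searxng';

async function persistResults(category: string, location: string, results: BusinessData[]) {
  const store = new SupabaseService();

  // Write each business row, then keep the cached query results in sync
  await store.upsertBusinesses(results);
  await CacheService.updateCache(category, location, results);
}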