Add project files:

- Add database initialization scripts
- Add configuration files
- Add documentation
- Add public assets
- Add source code structure
- Update README
This commit is contained in:
eligrinfeld 2025-01-04 17:22:46 -07:00
parent 372943801d
commit fde5b5e318
39 changed files with 10099 additions and 187 deletions

View file

@ -0,0 +1,111 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { Cache } from '../utils/cache';
import { RateLimiter } from '../utils/rateLimiter';
/** Content gathered from one crawl of a business website. */
interface CrawlResult {
/** Text content of the landing page's <body> element. */
mainContent: string;
/** Response body fetched from the linked contact page; '' when no link was found or the fetch failed. */
contactInfo: string;
/** Response body fetched from the linked about page; '' when no link was found or the fetch failed. */
aboutInfo: string;
/** Parsed JSON-LD blocks from the landing page (an array on success, {} when the crawl fails). */
structuredData: any;
}
/**
 * Crawls a business website: the landing page plus the first linked
 * "contact" and "about" pages, and any JSON-LD structured data on the
 * landing page. Successful crawls are cached per URL.
 */
export class BusinessCrawler {
  private cache: Cache<CrawlResult>;
  private rateLimiter: RateLimiter;

  constructor() {
    // 60 is passed through to Cache; the original note describes it as a 1 hour cache.
    this.cache = new Cache<CrawlResult>(60);
    this.rateLimiter = new RateLimiter();
  }

  /**
   * Crawls `url` and returns the extracted content.
   *
   * Never throws: on any failure an empty result is returned. Failed crawls
   * are NOT cached, so a transient outage does not mask the site until the
   * cache entry expires.
   *
   * @param url Absolute URL of the business landing page.
   * @returns The crawl result, served from cache when available.
   */
  async crawlBusinessSite(url: string): Promise<CrawlResult> {
    const cached = this.cache.get(url);
    if (cached) return cached;

    await this.rateLimiter.waitForSlot();

    try {
      const mainPage = await this.fetchPage(url);
      if (!mainPage) {
        // fetchPage reports failure as '' — do not cache an empty crawl.
        return this.emptyResult();
      }

      const $ = cheerio.load(mainPage);
      const contactUrl = this.findContactPage($, url);
      const aboutUrl = this.findAboutPage($, url);

      // The two secondary pages are independent, so fetch them in parallel.
      const [contactPage, aboutPage] = await Promise.all([
        contactUrl ? this.fetchPage(contactUrl) : '',
        aboutUrl ? this.fetchPage(aboutUrl) : ''
      ]);

      const result: CrawlResult = {
        mainContent: $('body').text(),
        contactInfo: contactPage,
        aboutInfo: aboutPage,
        structuredData: this.extractStructuredData($)
      };
      this.cache.set(url, result);
      return result;
    } catch (error) {
      console.error(`Failed to crawl ${url}:`, error);
      return this.emptyResult();
    }
  }

  /** Result returned when nothing could be crawled. */
  private emptyResult(): CrawlResult {
    return {
      mainContent: '',
      contactInfo: '',
      aboutInfo: '',
      structuredData: {}
    };
  }

  /**
   * Downloads a page as text.
   *
   * @returns The response body, or '' when the request fails or the body is
   *          not a string.
   */
  private async fetchPage(url: string): Promise<string> {
    try {
      const response = await axios.get(url, {
        timeout: 10000,
        // Ask for the raw body so a JSON response is not parsed into an object.
        responseType: 'text',
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; BizSearch/1.0; +http://localhost:3000/about)',
        }
      });
      return typeof response.data === 'string' ? response.data : '';
    } catch (error) {
      console.error(`Failed to fetch ${url}:`, error);
      return '';
    }
  }

  /** Finds the first crawlable link that looks like a contact page. */
  private findContactPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
    return this.resolveFirstLink($, 'a[href*="contact"], a:contains("Contact")', baseUrl);
  }

  /** Finds the first crawlable link that looks like an about page. */
  private findAboutPage($: cheerio.CheerioAPI, baseUrl: string): string | null {
    return this.resolveFirstLink($, 'a[href*="about"], a:contains("About")', baseUrl);
  }

  /**
   * Returns the first link matching `selector` that resolves to an http(s)
   * URL. Links such as `mailto:contact@…`, `tel:` or `javascript:` also match
   * the href selectors but cannot be fetched, and malformed hrefs make
   * `new URL` throw — both are skipped instead of aborting the crawl.
   */
  private resolveFirstLink(
    $: cheerio.CheerioAPI,
    selector: string,
    baseUrl: string
  ): string | null {
    for (const element of $(selector).toArray()) {
      const href = $(element).attr('href');
      if (!href) continue;
      try {
        const resolved = new URL(href, baseUrl);
        if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
          return resolved.toString();
        }
      } catch {
        // Malformed href: try the next candidate.
      }
    }
    return null;
  }

  /**
   * Parses every `<script type="application/ld+json">` block on the page.
   * Blocks that are not valid JSON are logged and skipped.
   */
  private extractStructuredData($: cheerio.CheerioAPI): unknown[] {
    const structuredData: unknown[] = [];
    $('script[type="application/ld+json"]').each((_, element) => {
      try {
        structuredData.push(JSON.parse($(element).html() || '{}'));
      } catch (error) {
        console.error('Failed to parse structured data:', error);
      }
    });
    return structuredData;
  }
}

View file

@ -0,0 +1,71 @@
import { supabase } from '../supabase';
import { BusinessData } from '../searxng';
/**
 * Read/write access to the `search_cache` table, keyed by lower-cased
 * category and location. All methods are best-effort: failures are logged
 * and never thrown to the caller.
 */
export class CacheService {
  /** Cache keys are stored and matched lower-cased. */
  private static normalizeKey(value: string): string {
    return value.toLowerCase();
  }

  /**
   * Looks up the newest unexpired cache entry for a category/location pair.
   *
   * @returns The cached results, or null on a cache miss or lookup failure.
   */
  static async getCachedResults(category: string, location: string): Promise<BusinessData[] | null> {
    try {
      const { data, error } = await supabase
        .from('search_cache')
        .select('results')
        .eq('category', CacheService.normalizeKey(category))
        .eq('location', CacheService.normalizeKey(location))
        .gt('expires_at', new Date().toISOString())
        .order('created_at', { ascending: false })
        .limit(1)
        .single();

      if (error) {
        // PGRST116 is what `.single()` reports when no row matched: that is
        // an ordinary cache miss, not a failure worth logging.
        if (error.code === 'PGRST116') return null;
        throw error;
      }
      return data ? data.results : null;
    } catch (error) {
      console.error('Cache lookup failed:', error);
      return null;
    }
  }

  /**
   * Inserts a new cache entry.
   *
   * @param expiresInDays Lifetime of the entry in days (default 7).
   */
  static async cacheResults(
    category: string,
    location: string,
    results: BusinessData[],
    expiresInDays: number = 7
  ): Promise<void> {
    try {
      const expiresAt = new Date();
      expiresAt.setDate(expiresAt.getDate() + expiresInDays);

      const { error } = await supabase
        .from('search_cache')
        .insert({
          query: `${category} in ${location}`,
          category: CacheService.normalizeKey(category),
          location: CacheService.normalizeKey(location),
          results,
          expires_at: expiresAt.toISOString()
        });
      if (error) throw error;
    } catch (error) {
      console.error('Failed to cache results:', error);
    }
  }

  /**
   * Replaces the results of every cache row for a category/location pair.
   * The expiry of those rows is left unchanged.
   */
  static async updateCache(
    category: string,
    location: string,
    newResults: BusinessData[]
  ): Promise<void> {
    try {
      const { error } = await supabase
        .from('search_cache')
        .update({
          results: newResults,
          updated_at: new Date().toISOString()
        })
        .eq('category', CacheService.normalizeKey(category))
        .eq('location', CacheService.normalizeKey(location));
      if (error) throw error;
    } catch (error) {
      console.error('Failed to update cache:', error);
    }
  }
}

View file

@ -0,0 +1,107 @@
import { OllamaService } from './ollamaService';
/** Cleaned business record produced by DataValidationService. */
interface ValidatedBusinessData {
/** Business name; 'Unknown' when none could be extracted. */
name: string;
/** Phone number, formatted as (XXX) XXX-XXXX when it has ten digits; 'N/A' when missing. */
phone: string;
/** Email address; 'N/A' when missing. */
email: string;
/** Physical address; 'N/A' when missing. */
address: string;
/** Short description; '' when missing. */
description: string;
/** Business hours, when available. */
hours?: string;
/** Whether the record is considered usable (the prompt asks for a name plus at least one contact method). */
isValid: boolean;
}
/**
 * Uses the Ollama model to extract business details from free text, then
 * normalizes and sanity-checks what the model returned.
 */
export class DataValidationService {
  private ollama: OllamaService;

  constructor() {
    this.ollama = new OllamaService();
  }

  /**
   * Extracts and cleans business data from `rawText`.
   *
   * Never throws: if the model call fails or its answer cannot be parsed, a
   * placeholder record with `isValid: false` is returned.
   *
   * @param rawText Unstructured text describing a business.
   */
  async validateAndCleanData(rawText: string): Promise<ValidatedBusinessData> {
    try {
      // OllamaService exposes `complete`; there is no `generateResponse` method.
      const response = await this.ollama.complete(this.buildPrompt(rawText));
      try {
        // The model may wrap the JSON in prose; take the outermost {...} span.
        const jsonMatch = response.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
          throw new Error('No JSON found in response');
        }
        return this.validateResult(JSON.parse(jsonMatch[0]));
      } catch (parseError) {
        console.error('Failed to parse Ollama response:', parseError);
        throw parseError;
      }
    } catch (error) {
      console.error('Data validation failed:', error);
      return {
        name: 'Unknown',
        phone: 'N/A',
        email: 'N/A',
        address: 'N/A',
        description: '',
        hours: '',
        isValid: false
      };
    }
  }

  /** Builds the extraction prompt sent to the model. */
  private buildPrompt(rawText: string): string {
    return `
You are a business data validation expert. Extract and validate business information from the following text.
Return ONLY a JSON object with the following format, nothing else:
{
"name": "verified business name",
"phone": "formatted phone number or N/A",
"email": "verified email address or N/A",
"address": "verified physical address or N/A",
"description": "short business description",
"hours": "business hours if available",
"isValid": boolean
}
Rules:
1. Phone numbers should be in (XXX) XXX-XXXX format
2. Addresses should be properly formatted with street, city, state, zip
3. Remove any irrelevant text from descriptions
4. Set isValid to true only if name and at least one contact method is found
5. Clean up any obvious formatting issues
6. Validate email addresses for proper format
Text to analyze:
${rawText}
`;
  }

  /**
   * Normalizes the model's JSON into a complete record.
   *
   * `isValid` is true only when the model claims validity AND the cleaned
   * record really has a name plus at least one of phone, email or address
   * (rule 4 of the prompt). A string such as "false" is not treated as true.
   */
  private validateResult(result: Record<string, unknown>): ValidatedBusinessData {
    const source = result ?? {};
    const name = this.cleanField(source.name) || 'Unknown';
    const phone = this.formatPhone(source.phone) || 'N/A';
    const email = this.cleanField(source.email) || 'N/A';
    const address = this.cleanField(source.address) || 'N/A';

    const flag = source.isValid;
    const claimedValid =
      flag === true || (typeof flag === 'string' && flag.trim().toLowerCase() === 'true');
    const isPresent = (value: string): boolean =>
      value !== '' && value.toUpperCase() !== 'N/A';
    const hasName = name !== 'Unknown' && isPresent(name);
    const hasContact = [phone, email, address].some(isPresent);

    return {
      name,
      phone,
      email,
      address,
      description: this.cleanField(source.description) || '',
      hours: this.cleanField(source.hours),
      isValid: claimedValid && hasName && hasContact
    };
  }

  /** Trims a string and collapses internal whitespace; non-strings become ''. */
  private cleanField(value: unknown): string {
    if (typeof value !== 'string') return '';
    return value.trim().replace(/\s+/g, ' ');
  }

  /**
   * Formats a phone number as (XXX) XXX-XXXX.
   *
   * Accepts numbers as well as strings (models sometimes emit bare digits),
   * and drops a leading country code 1 from 11-digit numbers. Values that do
   * not contain exactly ten digits are returned trimmed but otherwise
   * untouched; missing values yield 'N/A'.
   */
  private formatPhone(phone: unknown): string {
    const raw = typeof phone === 'number' ? String(phone) : phone;
    if (typeof raw !== 'string') return 'N/A';

    const trimmed = raw.trim();
    if (!trimmed || trimmed === 'N/A') return 'N/A';

    let digits = trimmed.replace(/\D/g, '');
    if (digits.length === 11 && digits.startsWith('1')) {
      digits = digits.slice(1);
    }
    if (digits.length === 10) {
      return `(${digits.slice(0, 3)}) ${digits.slice(3, 6)}-${digits.slice(6)}`;
    }
    return trimmed;
  }
}

View file

@ -0,0 +1,53 @@
import axios from 'axios';
import { env } from '../../config/env';
import { supabase } from '../supabase';
/**
 * Connectivity probes for the external services the app depends on.
 * Every check resolves to a boolean and never throws.
 */
export class HealthCheckService {
  /** Upper bound per HTTP probe so a hung dependency cannot stall the check. */
  private static readonly PROBE_TIMEOUT_MS = 5000;

  /** Issues a GET and reports whether the response status was 200. */
  private static async probe(url: string): Promise<boolean> {
    const response = await axios.get(url, {
      timeout: HealthCheckService.PROBE_TIMEOUT_MS
    });
    return response.status === 200;
  }

  /** Checks that the Ollama server answers on `/api/tags`. */
  static async checkOllama(): Promise<boolean> {
    try {
      return await HealthCheckService.probe(`${env.ollama.url}/api/tags`);
    } catch (error) {
      console.error('Ollama health check failed:', error);
      return false;
    }
  }

  /**
   * Checks that SearxNG answers on `/config`, trying the current instance
   * first and then the first configured instance as a fallback.
   */
  static async checkSearxNG(): Promise<boolean> {
    try {
      return await HealthCheckService.probe(`${env.searxng.currentUrl}/config`);
    } catch (error) {
      const fallbackBase = env.searxng.instances?.[0];
      if (!fallbackBase) {
        // Nothing to fall back to; avoid probing "undefined/config".
        console.error('SearxNG health check failed:', error);
        return false;
      }
      try {
        return await HealthCheckService.probe(`${fallbackBase}/config`);
      } catch (fallbackError) {
        console.error('SearxNG health check failed:', error);
        console.error('SearxNG fallback health check failed:', fallbackError);
        return false;
      }
    }
  }

  /** Checks that Supabase accepts a query against the `businesses` table. */
  static async checkSupabase(): Promise<boolean> {
    try {
      console.log('Checking Supabase connection...');
      console.log('URL:', env.supabase.url);

      // Only the ability to query matters here; the result itself is ignored.
      const { error } = await supabase
        .from('businesses')
        .select('count', { count: 'planned', head: true });

      if (error) {
        console.error('Supabase query error:', error);
        return false;
      }
      console.log('Supabase connection successful');
      return true;
    } catch (error) {
      console.error('Supabase connection failed:', error);
      return false;
    }
  }
}

View file

@ -0,0 +1,36 @@
import axios from 'axios';
import { env } from '../../config/env';
/**
 * NOTE(review): presumably the JSON body returned by Ollama's /api/generate
 * endpoint; it is not referenced by the code visible in this file — confirm.
 */
interface OllamaResponse {
/** Generated text; the field OllamaService.complete reads from the response body. */
response: string;
/** NOTE(review): optional numeric array; its meaning is not shown here — confirm against the Ollama API. */
context?: number[];
}
/** Thin client for the Ollama text-generation HTTP API. */
export class OllamaService {
  private url: string;
  private model: string;

  constructor() {
    this.url = env.ollama.url;
    this.model = env.ollama.model;
  }

  /**
   * Sends `prompt` to `/api/generate` (non-streaming) and returns the
   * generated text.
   *
   * @throws The underlying request error, or an Error when the response body
   *         carries no string `response` field. Failures are logged first.
   */
  async complete(prompt: string): Promise<string> {
    try {
      const response = await axios.post(`${this.url}/api/generate`, {
        model: this.model,
        prompt: prompt,
        stream: false,
        options: {
          temperature: 0.7,
          top_p: 0.9
        }
      });

      // Enforce the Promise<string> contract instead of leaking undefined
      // to callers that immediately call string methods on the result.
      const text: unknown = response.data?.response;
      if (typeof text !== 'string') {
        throw new Error('Ollama response did not contain a "response" string');
      }
      return text;
    } catch (error) {
      console.error('Ollama completion failed:', error);
      throw error;
    }
  }
}

View file

@ -0,0 +1,93 @@
import { createClient } from '@supabase/supabase-js';
import { env } from '../../config/env';
import { BusinessData } from '../searxng';
/** Persists crawled business records to the Supabase `businesses` table. */
export class SupabaseService {
  private supabase;

  constructor() {
    this.supabase = createClient(env.supabase.url, env.supabase.anonKey);
  }

  /**
   * Inserts or updates each business, one at a time.
   *
   * For a business that already exists the higher of the stored and incoming
   * rating is kept and `search_count` is incremented. A failure on one
   * business is logged and does not stop the others.
   *
   * @param businesses Records to persist.
   * @throws Only for failures outside per-business processing.
   */
  async upsertBusinesses(businesses: BusinessData[]): Promise<void> {
    try {
      console.log('Upserting businesses to Supabase:', businesses.length);

      for (const business of businesses) {
        try {
          const identifier = this.buildIdentifier(business);
          if (!identifier) {
            console.error('Skipping business without identifying data:', business.name);
            continue;
          }

          console.log('Upserting business:', {
            id: identifier,
            name: business.name,
            phone: business.phone,
            email: business.email,
            address: business.address,
            rating: business.rating,
            website: business.website,
            location: business.location
          });

          const { data: existing, error: selectError } = await this.supabase
            .from('businesses')
            .select('rating, search_count')
            .eq('id', identifier)
            .single();

          // PGRST116 means "no row found", i.e. a new business. Any other
          // error leaves the stored state unknown, so skip rather than
          // overwrite rating/search_count as if the row did not exist.
          if (selectError && selectError.code !== 'PGRST116') {
            console.error('Error checking existing business:', selectError);
            continue;
          }

          const upsertData = {
            id: identifier,
            name: business.name,
            phone: business.phone || null,
            email: business.email || null,
            address: business.address || null,
            rating: existing
              ? (this.highestRating(business.rating, existing.rating) ?? business.rating)
              : business.rating,
            website: business.website || null,
            logo: business.logo || null,
            source: business.source || null,
            description: business.description || null,
            // `??` rather than `||`: 0 is a valid coordinate.
            latitude: business.location?.lat ?? null,
            longitude: business.location?.lng ?? null,
            last_updated: new Date().toISOString(),
            search_count: existing
              ? (this.toFiniteNumber(existing.search_count) ?? 0) + 1
              : 1
          };
          console.log('Upserting with data:', upsertData);

          const { error: upsertError } = await this.supabase
            .from('businesses')
            .upsert(upsertData, {
              onConflict: 'id'
            });

          if (upsertError) {
            console.error('Error upserting business:', upsertError);
            console.error('Failed business data:', upsertData);
          } else {
            console.log(`Successfully upserted business: ${business.name}`);
          }
        } catch (businessError) {
          console.error('Error processing business:', business.name, businessError);
        }
      }
    } catch (error) {
      console.error('Error saving businesses to Supabase:', error);
      throw error;
    }
  }

  /**
   * Derives a row id from the name, phone digits, address and website:
   * present parts are joined with '_' and every character outside a-z0-9
   * is replaced by '_'.
   */
  private buildIdentifier(business: BusinessData): string {
    return [
      business.name.toLowerCase(),
      business.phone?.replace(/\D/g, ''),
      business.address?.toLowerCase(),
      business.website?.toLowerCase()
    ]
      .filter(Boolean)
      .join('_')
      .replace(/[^a-z0-9]/g, '_');
  }

  /** Converts a value to a finite number, or null when it is missing or not numeric. */
  private toFiniteNumber(value: unknown): number | null {
    if (value === null || value === undefined || value === '') return null;
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : null;
  }

  /**
   * Returns the larger of the two ratings, ignoring missing values so that a
   * missing incoming rating cannot turn the stored one into NaN.
   */
  private highestRating(incoming: unknown, stored: unknown): number | null {
    const candidates = [this.toFiniteNumber(incoming), this.toFiniteNumber(stored)].filter(
      (value): value is number => value !== null
    );
    return candidates.length > 0 ? Math.max(...candidates) : null;
  }
}
}