test: add CI/CD workflow

This commit is contained in:
eligrinfeld 2025-01-05 14:16:31 -07:00
parent 66d44c0774
commit ce97671da3
28 changed files with 11684 additions and 1199 deletions

133
.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,133 @@
---
# CI/CD pipeline.
#   test              - runs on every push / pull request to main or develop
#   build             - runs on pushes only, after the tests pass
#   deploy-staging    - runs on pushes to develop
#   deploy-production - runs on pushes to main
name: CI/CD

on:
  push:
    branches: [main, develop]
  pull_request:
    branches: [main, develop]

# Least privilege: no job in this workflow writes to the repository.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    services:
      # A real PostgreSQL server. The previous image (supabase/postgres-meta)
      # is a metadata REST API, not a database: it ignores the POSTGRES_*
      # variables and the `pg_isready` health check below cannot succeed in it.
      postgres:
        image: postgres:15
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_USER: postgres
          POSTGRES_DB: postgres
        ports:
          - '5432:5432'
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - uses: actions/checkout@v4
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'
      - name: Install dependencies
        run: npm ci
      - name: Check code formatting
        run: npm run format
      - name: Run tests with coverage
        run: npm run test:coverage
        env:
          SUPABASE_URL: http://localhost:54321
          SUPABASE_KEY: test-key
          OLLAMA_URL: http://localhost:11434
          SEARXNG_URL: http://localhost:8080
          NODE_ENV: test
          CACHE_DURATION_DAYS: '7'
          # The env schema added in this commit makes this key required, so any
          # test that imports the config would fail without a placeholder.
          HUGGING_FACE_API_KEY: test-key
      - name: Upload coverage reports
        uses: codecov/codecov-action@v4
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage/lcov.info
          fail_ci_if_error: true

  build:
    needs: test
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop')
    steps:
      - uses: actions/checkout@v4
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'
      - name: Install dependencies
        run: npm ci
      - name: Build
        run: npm run build
      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
          # Fail here, not later in a deploy job, if the build produced nothing.
          if-no-files-found: error

  deploy-staging:
    needs: build
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/develop'
    environment:
      name: staging
      url: https://staging.example.com
    steps:
      - uses: actions/checkout@v4
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - name: Deploy to staging
        run: |
          echo "Deploying to staging environment"
          # Add your staging deployment commands here
        env:
          DEPLOY_KEY: ${{ secrets.DEPLOY_KEY }}

  deploy-production:
    needs: build
    runs-on: ubuntu-latest
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    environment:
      name: production
      url: https://example.com
    steps:
      - uses: actions/checkout@v4
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/
      - name: Deploy to production
        run: |
          echo "Deploying to production environment"
          # Add your production deployment commands here
        env:
          DEPLOY_KEY: ${{ secrets.DEPLOY_KEY }}

53
.gitignore vendored
View file

@ -1,39 +1,32 @@
# Node.js
node_modules/
npm-debug.log
yarn-error.log
# Build output
/.next/
/out/
/dist/
# IDE/Editor specific
.vscode/
.idea/
*.iml
# Environment variables # Environment variables
.env .env
.env.local .env.*
.env.development.local !.env.example
.env.test.local
.env.production.local
# Config files # Dependencies
config.toml node_modules/
yarn-error.log
npm-debug.log
# Log files # Build outputs
logs/ dist/
*.log build/
.next/
# Testing # IDE/Editor
/coverage/ .vscode/
.idea/
*.swp
*.swo
# Miscellaneous # OS
.DS_Store .DS_Store
Thumbs.db Thumbs.db
# Db # Logs
db.sqlite logs/
/searxng *.log
# Cache
.cache/
.npm/

14
config.toml Normal file
View file

@ -0,0 +1,14 @@
# Application configuration (TOML), grouped into general settings, provider API
# keys and service endpoints.
# NOTE(review): the .gitignore change in this same commit no longer lists
# config.toml, so this file is now tracked — keep real API keys out of it.

# General server / model behaviour.
[GENERAL]
PORT = 3001 # Port to run the server on
SIMILARITY_MEASURE = "cosine" # "cosine" or "dot"
KEEP_ALIVE = "5m" # How long to keep Ollama models loaded into memory. (Instead of using -1 use "-1m")
# Provider API keys. All are empty here; the trailing comments show the
# expected shape of each key, not real credentials.
[API_KEYS]
OPENAI = "" # OpenAI API key - sk-1234567890abcdef1234567890abcdef
GROQ = "" # Groq API key - gsk_1234567890abcdef1234567890abcdef
ANTHROPIC = "" # Anthropic API key - sk-ant-1234567890abcdef1234567890abcdef
GEMINI = "" # Gemini API key - sk-1234567890abcdef1234567890abcdef
# Base URLs of the external services the application talks to.
[API_ENDPOINTS]
SEARXNG = "http://localhost:32768" # SearxNG API URL
OLLAMA = "" # Ollama API URL - http://host.docker.internal:11434

17
jest.config.js Normal file
View file

@ -0,0 +1,17 @@
module.exports = {
preset: 'ts-jest',
testEnvironment: 'node',
roots: ['<rootDir>/src'],
testMatch: ['**/__tests__/**/*.ts', '**/?(*.)+(spec|test).ts'],
transform: {
'^.+\\.ts$': 'ts-jest',
},
moduleFileExtensions: ['ts', 'js', 'json', 'node'],
collectCoverageFrom: [
'src/**/*.{ts,js}',
'!src/tests/**',
'!**/node_modules/**',
],
coverageDirectory: 'coverage',
setupFilesAfterEnv: ['<rootDir>/src/tests/setup.ts'],
};

6015
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -11,24 +11,35 @@
"format": "prettier . --check", "format": "prettier . --check",
"format:write": "prettier . --write", "format:write": "prettier . --write",
"test:search": "ts-node src/tests/testSearch.ts", "test:search": "ts-node src/tests/testSearch.ts",
"test:supabase": "ts-node src/tests/supabaseTest.ts" "test:supabase": "ts-node src/tests/supabaseTest.ts",
"test:deepseek": "ts-node src/tests/testDeepseek.ts",
"test": "jest",
"test:watch": "jest --watch",
"test:coverage": "jest --coverage"
}, },
"devDependencies": { "devDependencies": {
"@testing-library/jest-dom": "^6.1.5",
"@types/better-sqlite3": "^7.6.10", "@types/better-sqlite3": "^7.6.10",
"@types/cors": "^2.8.17", "@types/cors": "^2.8.17",
"@types/express": "^4.17.21", "@types/express": "^4.17.21",
"@types/html-to-text": "^9.0.4", "@types/html-to-text": "^9.0.4",
"@types/jest": "^29.5.11",
"@types/multer": "^1.4.12", "@types/multer": "^1.4.12",
"@types/pdf-parse": "^1.1.4", "@types/pdf-parse": "^1.1.4",
"@types/readable-stream": "^4.0.11", "@types/readable-stream": "^4.0.11",
"@types/supertest": "^6.0.2",
"@types/ws": "^8.5.12", "@types/ws": "^8.5.12",
"drizzle-kit": "^0.22.7", "drizzle-kit": "^0.22.7",
"jest": "^29.7.0",
"nodemon": "^3.1.0", "nodemon": "^3.1.0",
"prettier": "^3.2.5", "prettier": "^3.2.5",
"supertest": "^7.0.0",
"ts-jest": "^29.1.1",
"ts-node": "^10.9.2", "ts-node": "^10.9.2",
"typescript": "^5.4.3" "typescript": "^5.4.3"
}, },
"dependencies": { "dependencies": {
"@huggingface/transformers": "latest",
"@iarna/toml": "^2.2.5", "@iarna/toml": "^2.2.5",
"@langchain/anthropic": "^0.2.3", "@langchain/anthropic": "^0.2.3",
"@langchain/community": "^0.2.16", "@langchain/community": "^0.2.16",
@ -52,6 +63,7 @@
"pdf-parse": "^1.1.1", "pdf-parse": "^1.1.1",
"robots-parser": "^3.0.1", "robots-parser": "^3.0.1",
"tesseract.js": "^4.1.4", "tesseract.js": "^4.1.4",
"torch": "latest",
"winston": "^3.13.0", "winston": "^3.13.0",
"ws": "^8.17.1", "ws": "^8.17.1",
"zod": "^3.22.4" "zod": "^3.22.4"

View file

@ -11,7 +11,49 @@ search:
server: server:
secret_key: 'a2fb23f1b02e6ee83875b09826990de0f6bd908b6638e8c10277d415f6ab852b' # Is overwritten by ${SEARXNG_SECRET} secret_key: 'a2fb23f1b02e6ee83875b09826990de0f6bd908b6638e8c10277d415f6ab852b' # Is overwritten by ${SEARXNG_SECRET}
port: 8080
bind_address: "0.0.0.0"
base_url: http://localhost:8080/
engines: engines:
- name: wolframalpha - name: wolframalpha
disabled: false disabled: false
- name: google
engine: google
shortcut: g
disabled: false
- name: bing
engine: bing
shortcut: b
disabled: false
- name: duckduckgo
engine: duckduckgo
shortcut: d
disabled: false
- name: yelp
engine: yelp
shortcut: y
disabled: false
ui:
static_path: ""
templates_path: ""
default_theme: simple
default_locale: en
results_on_new_tab: false
outgoing:
request_timeout: 6.0
max_request_timeout: 10.0
pool_connections: 100
pool_maxsize: 10
enable_http2: true
# NOTE(review): a `server:` mapping also appears earlier in this file (holding
# secret_key, port, bind_address and base_url). If both are top-level keys this
# is a duplicate key: most YAML parsers silently keep only the last one, which
# would discard those earlier settings. The three settings below presumably
# belong inside that first `server:` block — confirm against the un-flattened
# file and merge.
server:
limiter: false
image_proxy: false
http_protocol_version: "1.0"

View file

@ -15,7 +15,10 @@ const envSchema = z.object({
SEARXNG_INSTANCES: z.string().default('["http://localhost:4000"]'), SEARXNG_INSTANCES: z.string().default('["http://localhost:4000"]'),
MAX_RESULTS_PER_QUERY: z.string().default('50'), MAX_RESULTS_PER_QUERY: z.string().default('50'),
CACHE_DURATION_HOURS: z.string().default('24'), CACHE_DURATION_HOURS: z.string().default('24'),
CACHE_DURATION_DAYS: z.string().default('7') CACHE_DURATION_DAYS: z.string().default('7'),
HUGGING_FACE_API_KEY: z.string({
required_error: "HUGGING_FACE_API_KEY is required in .env"
})
}); });
// Define the final environment type // Define the final environment type
@ -39,6 +42,15 @@ export interface EnvConfig {
durationHours: number; durationHours: number;
durationDays: number; durationDays: number;
}; };
ai: {
model: string;
temperature: number;
maxTokens: number;
batchSize: number;
};
huggingface: {
apiKey: string;
};
} }
// Parse and transform the environment variables // Parse and transform the environment variables
@ -64,5 +76,14 @@ export const env: EnvConfig = {
maxResultsPerQuery: parseInt(rawEnv.MAX_RESULTS_PER_QUERY), maxResultsPerQuery: parseInt(rawEnv.MAX_RESULTS_PER_QUERY),
durationHours: parseInt(rawEnv.CACHE_DURATION_HOURS), durationHours: parseInt(rawEnv.CACHE_DURATION_HOURS),
durationDays: parseInt(rawEnv.CACHE_DURATION_DAYS) durationDays: parseInt(rawEnv.CACHE_DURATION_DAYS)
},
ai: {
model: 'deepseek-ai/deepseek-coder-6.7b-instruct',
temperature: 0.7,
maxTokens: 512,
batchSize: 3
},
huggingface: {
apiKey: rawEnv.HUGGING_FACE_API_KEY
} }
}; };

View file

@ -1,4 +1,4 @@
import { OllamaService } from './ollamaService'; import { DeepSeekService } from './deepseekService';
import { Business } from '../types'; import { Business } from '../types';
import { db } from './databaseService'; import { db } from './databaseService';
@ -21,7 +21,7 @@ export class CleanupService {
setTimeout(() => reject(new Error('LLM timeout')), LLM_TIMEOUT); setTimeout(() => reject(new Error('LLM timeout')), LLM_TIMEOUT);
}); });
const llmPromise = OllamaService.chat([{ const llmPromise = DeepSeekService.chat([{
role: 'user', role: 'user',
content: prompt content: prompt
}]); }]);
@ -205,58 +205,17 @@ export class CleanupService {
return cached; return cached;
} }
const combinedPrompt = ` // Clean using DeepSeek
Clean and format the following business information. For each field, follow the format shown in the examples. const cleaned = await DeepSeekService.cleanBusinessData(business);
The business type appears to be: ${business.name.toLowerCase().includes('restaurant') ? 'restaurant' : const validated = this.validateAndClean({ ...business, ...cleaned });
business.name.toLowerCase().includes('plumb') ? 'plumber' :
business.name.toLowerCase().includes('electric') ? 'electrician' : 'business'}
Return each field on a new line with the field name followed by a colon.
Only return valid data - if something looks wrong or invalid, return an empty string.
Examples for address:
Input: "Sure! Here is the business address in Denver, CO:\\n\\n14100 W 7th Ave, Golden CO 80401"
Output: 14100 W 7th Ave, Golden, CO 80401
Examples for phone:
Input: "7203796281"
Output: (720) 379-6281
Input: "N/A" or "none"
Output:
Examples for email:
Input: "379-6281info@brutalpoodledenver.com"
Output: info@brutalpoodledenver.com
Input: "top-seo-img@2x.jpg" or "Union Office" or "[email]" or "None"
Output:
Examples for description:
Input: "The Brutal Noodle $14.00 Beef bone broth, smoked brisket, rice noodles, all the fixins. (GF) Vegan available with tofu & veggie broth $11"
Output: Asian fusion restaurant serving bone broth noodles with brisket and vegan options.
Input: "Our Denver-based expert plumbers can repair or install any fixture. Commercial services: We're ready to keep your plumbing system operating safely."
Output: Professional plumbing services for residential and commercial properties in Denver.
Business name for context: "${business.name}"
Website for context: "${business.website}"
Now clean these fields:
Address: "${business.address}"
Phone: "${business.phone}"
Email: "${business.email}"
Description: "${business.description}"
`;
const response = await this.cleanWithLLM(combinedPrompt, business);
const parsed = this.parseResponse(response);
const cleaned = this.validateAndClean({ ...business, ...parsed });
// Only cache if confidence score is high enough // Only cache if confidence score is high enough
const confidence = this.calculateConfidenceScore(cleaned); const confidence = this.calculateConfidenceScore(validated);
if (confidence >= MIN_CONFIDENCE_SCORE) { if (confidence >= MIN_CONFIDENCE_SCORE) {
await db.saveToCache(cacheKey, cleaned, 24 * 60 * 60 * 1000); await db.saveToCache(cacheKey, validated, 24 * 60 * 60 * 1000);
} }
return cleaned; return validated;
} }
static async cleanBusinessRecords(businesses: Business[]): Promise<Business[]> { static async cleanBusinessRecords(businesses: Business[]): Promise<Business[]> {

View file

@ -20,11 +20,15 @@ export class DatabaseService {
} }
async searchBusinesses(query: string, location: string): Promise<BusinessData[]> { async searchBusinesses(query: string, location: string): Promise<BusinessData[]> {
try {
const { data, error } = await this.supabase const { data, error } = await this.supabase
.from('businesses') .from('businesses')
.select('*') .select('*')
.textSearch('name', query) .or(
.textSearch('address', location) `name.ilike.%${query}%,` +
`description.ilike.%${query}%`
)
.ilike('address', `%${location}%`)
.order('search_count', { ascending: false }) .order('search_count', { ascending: false })
.limit(env.cache.maxResultsPerQuery); .limit(env.cache.maxResultsPerQuery);
@ -33,7 +37,12 @@ export class DatabaseService {
throw error; throw error;
} }
console.log(`Found ${data?.length || 0} businesses in database`);
return data || []; return data || [];
} catch (error) {
console.error('Error searching businesses:', error);
return [];
}
} }
async saveBusiness(business: Partial<BusinessData>): Promise<void> { async saveBusiness(business: Partial<BusinessData>): Promise<void> {
@ -135,6 +144,21 @@ export class DatabaseService {
throw error; throw error;
} }
} }
/**
 * Deletes rows from the `cache` table.
 *
 * @param pattern Optional SQL LIKE pattern (e.g. `search_%`) matched against
 *                the `key` column. When omitted, every cache row is deleted.
 *
 * Best-effort: failures are logged and never thrown, so callers do not need
 * to handle errors from cache maintenance.
 */
async clearCache(pattern?: string): Promise<void> {
  try {
    const deletion = this.supabase.from('cache').delete();

    // The delete must always carry a filter: the pattern when one is given,
    // otherwise a match-everything condition on the key column, so the request
    // is never sent as an unfiltered DELETE.
    const { error } = pattern
      ? await deletion.like('key', pattern)
      : await deletion.not('key', 'is', null);

    // supabase-js reports failures through the result object rather than by
    // throwing, so surface them to the catch block explicitly.
    if (error) {
      throw error;
    }
  } catch (error) {
    console.error('Error clearing cache:', error);
  }
}
} }
export const db = new DatabaseService(); export const db = new DatabaseService();

View file

@ -0,0 +1,460 @@
import axios from 'axios';
import { env } from '../../config/env';
import { Business } from '../types';
/**
 * Cleans scraped business records.
 *
 * The primary path prompts a local Ollama model and validates its JSON answer;
 * when the model keeps producing unusable output (or changes what kind of
 * business the record describes) the service falls back to deterministic,
 * regex-based cleaning.
 */
export class DeepSeekService {
  // NOTE(review): endpoint and model are hard-coded here even though `env` is
  // imported by this file and OllamaService reads both from `env.ollama` —
  // consider sharing that configuration.
  private static OLLAMA_URL = 'http://localhost:11434/api/generate';
  private static MODEL_NAME = 'qwen2:0.5b';
  private static MAX_ATTEMPTS = 3; // Prevent infinite loops

  /** Emoji / dingbat ranges stripped from every free-text field. */
  private static readonly EMOJI_PATTERN =
    /[\u{1F300}-\u{1F9FF}]|[\u{2700}-\u{27BF}]|[\u{1F600}-\u{1F64F}]/gu;

  /**
   * Runs `fn`, retrying with exponential backoff when it rejects.
   *
   * @param fn      Operation to attempt.
   * @param retries Maximum number of attempts (default 5).
   * @returns The first successful result of `fn`.
   * @throws The last error once every attempt has failed.
   */
  private static async retryWithBackoff<T>(fn: () => Promise<T>, retries = 5): Promise<T> {
    let lastError: unknown;

    for (let i = 0; i < retries; i++) {
      try {
        return await fn();
      } catch (error) {
        lastError = error;
        if (i === retries - 1) break;

        // Longer backoff for timeouts
        const isTimeout = axios.isAxiosError(error) && error.code === 'ECONNABORTED';
        const delay = isTimeout ?
          Math.pow(2, i) * 5000 : // 5s, 10s, 20s, 40s, 80s for timeouts
          Math.pow(2, i) * 1000;  // 1s, 2s, 4s, 8s, 16s for other errors

        console.log(`Retry ${i + 1}/${retries} after ${delay/1000}s...`);
        await new Promise(resolve => setTimeout(resolve, delay));
      }
    }

    // Reached when every attempt failed, or when `retries` was not positive
    // (in which case there is no error to rethrow).
    throw lastError ?? new Error('retryWithBackoff requires at least one attempt');
  }

  /**
   * Extracts a "street, city, CO zip" address from noisy scraped text.
   *
   * @param address Raw address text, possibly multi-line and decorated.
   * @returns The normalized address, or '' when none could be recognised.
   */
  private static cleanAddress(address: string): string {
    // Remove marketing and extra info first
    const lines = address
      .replace(this.EMOJI_PATTERN, '') // Remove emojis
      .replace(/(?:GET|ORDER|SCHEDULE|CONTACT|DIRECTIONS).*?[:!\n]/i, '') // Remove action words
      .replace(/\([^)]*\)/g, '') // Remove parenthetical info
      .replace(/(?:Next|Behind|Inside|Near).*$/im, '') // Remove location hints
      .split(/[\n\r]+/) // Split into lines
      .map(line => line.trim())
      .filter(Boolean); // Remove empty lines

    // Common address patterns, most specific first
    const patterns = [
      // Handle suite/unit in street address
      /(\d+[^,]+?(?:\s+(?:Suite|Ste|Unit|Apt|Building|Bldg|#)\s*[-A-Z0-9]+)?),\s*([^,]+?),\s*(?:CO|Colorado|COLORADO)[,\s]+(\d{5})/i,
      // Basic format
      /(\d+[^,]+?),\s*([^,]+?),\s*(?:CO|Colorado|COLORADO)[,\s]+(\d{5})/i,
      // No commas
      /(\d+[^,]+?)\s+([^,]+?)\s+(?:CO|Colorado|COLORADO)\s+(\d{5})/i,
    ];

    const capitalizeWords = (text: string): string =>
      text
        .split(' ')
        .map(word => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
        .join(' ');

    // Try to find the line with street address
    for (const line of lines) {
      for (const pattern of patterns) {
        const match = line.match(pattern);
        if (!match) continue;

        const [, street, city, zip] = match;

        // Separate the house number from a glued-on street name first and
        // collapse whitespace afterwards; doing it the other way round left a
        // double space after the number ("1234  Main St").
        const cleanedStreet = capitalizeWords(
          street
            .replace(/(\d+)/, '$1 ')
            .replace(/\s+/g, ' ')
            .trim()
        );
        const cleanedCity = capitalizeWords(city.trim());

        return `${cleanedStreet}, ${cleanedCity}, CO ${zip}`;
      }
    }

    // If no match found, try to extract components
    const streetLine = lines.find(line => /\d+/.test(line));
    if (streetLine) {
      const streetMatch = streetLine.match(/(\d+[^,\n]+?)(?:\s+(?:Suite|Ste|Unit|Apt|Building|Bldg|#)\s*[-A-Z0-9]+)?/i);
      const zipMatch = lines.join(' ').match(/\b(\d{5})\b/);

      if (streetMatch && zipMatch) {
        const street = streetMatch[0].trim();
        const zip = zipMatch[1];
        // No city was recognised on this path, so Denver is assumed.
        return `${street}, Denver, CO ${zip}`;
      }
    }

    return '';
  }

  /**
   * Deterministic, regex-only cleaning used when the model output is unusable.
   *
   * @param business Record (or partial record) to clean.
   * @returns A shallow copy with every recognisable field normalized; fields
   *          that cannot be parsed keep their original value.
   */
  private static manualClean(business: Partial<Business>): Partial<Business> {
    const cleaned = { ...business };

    // Clean address
    if (cleaned.address) {
      const cleanedAddress = this.cleanAddress(cleaned.address);
      if (cleanedAddress) {
        cleaned.address = cleanedAddress;
      }
    }

    // Extract business type first (from the uncleaned name)
    const businessType = this.detectBusinessType(cleaned.name || '');

    // Clean name while preserving core identity
    if (cleaned.name) {
      cleaned.name = cleaned.name
        // Remove emojis and special characters
        .replace(this.EMOJI_PATTERN, '')
        // Remove bracketed content but preserve important terms
        .replace(/\s*[\[\({](?!(?:BMW|Mercedes|Audi|specialist|certified)).*?[\]\)}]\s*/gi, ' ')
        // Remove business suffixes
        .replace(/\b(?:LLC|Inc|Corp|Ltd|DBA|Est\.|Since|P\.?C\.?)\b\.?\s*\d*/gi, '')
        // Clean up and normalize
        .replace(/[^\w\s&'-]/g, ' ')
        .replace(/\s+/g, ' ')
        .trim()
        .replace(/^THE\s+/i, ''); // Remove leading "THE"
    }

    // Clean phone - handle multiple numbers and formats
    if (cleaned.phone) {
      let digits = cleaned.phone.replace(/\D/g, '');

      // Area codes never start with 1 in the North American plan, so a leading
      // 1 in front of at least ten more digits is the country code. Dropping
      // it keeps "1-720-379-6281" from being formatted as (172) 037-9628.
      if (digits.length > 10 && digits.startsWith('1')) {
        digits = digits.slice(1);
      }

      if (digits.length >= 10) {
        const mainNumber = digits.slice(0, 10); // Ensure exactly 10 digits
        cleaned.phone = `(${mainNumber.slice(0,3)}) ${mainNumber.slice(3,6)}-${mainNumber.slice(6,10)}`;
      }
    }

    // Clean email - handle multiple emails and formats
    if (cleaned.email) {
      const emailMatch = cleaned.email.match(/([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})/);
      if (emailMatch?.[1]) {
        cleaned.email = emailMatch[1];
      }
    }

    // Improved description cleaning
    if (cleaned.description) {
      cleaned.description = this.extractCoreDescription(cleaned.description, businessType);
    }

    return cleaned;
  }

  /**
   * Classifies a business by keywords in its name.
   *
   * @returns 'auto', 'dental', 'coffee', 'plumbing' or 'restaurant'; 'business'
   *          when no keyword matches. The first matching type wins.
   */
  private static detectBusinessType(name: string): string {
    const types = {
      auto: /\b(?:auto|car|vehicle|BMW|Audi|Mercedes|mechanic|repair|service center)\b/i,
      dental: /\b(?:dental|dentist|orthodontic|smile|tooth|teeth)\b/i,
      coffee: /\b(?:coffee|cafe|espresso|roaster|brew)\b/i,
      // `plumb\w*` also covers "plumber(s)"; the previous `plumb|plumbing`
      // alternation between word boundaries missed it.
      plumbing: /\b(?:plumb\w*|rooter|drain|pipe)\b/i,
      restaurant: /\b(?:restaurant|grill|cuisine|bistro|kitchen)\b/i,
    };

    for (const [type, pattern] of Object.entries(types)) {
      if (pattern.test(name)) return type;
    }
    return 'business';
  }

  /**
   * Reduces a marketing-heavy description to the phrases relevant to the
   * business type.
   *
   * @returns The matching phrases joined by '. ', or a generic sentence when
   *          the type has no patterns or none of them match.
   */
  private static extractCoreDescription(description: string, businessType: string): string {
    // Remove all marketing and formatting first
    const cleaned = description
      .replace(this.EMOJI_PATTERN, '')
      .replace(/\$+\s*[^\s]*\s*(off|special|offer|deal|save|discount|price|cost|free)/gi, '')
      .replace(/\b(?:call|email|visit|contact|text|www\.|http|@|book|schedule|appointment)\b.*$/gi, '')
      .replace(/#\w+/g, '')
      .replace(/\s+/g, ' ')
      .trim();

    // Extract relevant information based on business type
    const typePatterns: { [key: string]: RegExp[] } = {
      auto: [
        /(?:specialist|specializing)\s+in\s+[^.]+/i,
        /(?:certified|ASE)[^.]+mechanic[^.]+/i,
        /(?:auto|car|vehicle)\s+(?:service|repair)[^.]+/i
      ],
      dental: [
        /(?:dental|orthodontic)\s+(?:care|services)[^.]+/i,
        /(?:family|cosmetic|general)\s+dentistry[^.]+/i,
        /state-of-the-art\s+facility[^.]+/i
      ],
      coffee: [
        /(?:coffee|espresso|pastry|cafe)[^.]+/i,
        /(?:organic|fair-trade|fresh)[^.]+/i,
        /(?:local|favorite|community)[^.]+coffee[^.]+/i
      ],
      plumbing: [
        /(?:plumbing|drain|pipe)\s+(?:service|repair)[^.]+/i,
        /(?:professional|expert|master)\s+plumb[^.]+/i,
        /(?:residential|commercial)\s+plumbing[^.]+/i
      ]
    };

    const relevantPhrases = typePatterns[businessType]?.map(pattern => {
      const match = cleaned.match(pattern);
      return match ? match[0] : '';
    }).filter(Boolean) || [];

    if (relevantPhrases.length > 0) {
      return relevantPhrases.join('. ');
    }

    // Fallback to generic description
    return `Professional ${businessType} services in Denver area`;
  }

  /**
   * Flattens model output to a single line of plain text: no emojis, no
   * control characters, single spaces.
   */
  private static sanitizeJsonResponse(response: string): string {
    return response
      // Remove emojis
      .replace(this.EMOJI_PATTERN, '')
      // Turn line breaks into spaces BEFORE stripping control characters:
      // \r and \n are themselves control characters, so stripping first made
      // this step a no-op and glued adjacent lines together.
      .replace(/\r?\n\s*/g, ' ')
      // Remove remaining control characters
      .replace(/[\u0000-\u001F\u007F-\u009F]/g, '')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Cleans one business record with the model, retrying on bad output.
   *
   * @param business Record to clean.
   * @param attempt  Zero-based attempt counter (internal; callers omit it).
   * @returns The cleaned record. After MAX_ATTEMPTS model attempts, or as soon
   *          as the model changes the detected business type, the result comes
   *          from manualClean instead.
   * @throws When the model endpoint cannot be reached after all retries.
   */
  static async cleanBusinessData(business: Business, attempt = 0): Promise<Business> {
    if (attempt >= this.MAX_ATTEMPTS) {
      console.log('Max cleaning attempts reached, applying manual cleaning...');
      return {
        ...business,
        ...this.manualClean(business)
      };
    }

    // Detect business type first
    const businessType = this.detectBusinessType(business.name || '');
    const requestId = Math.random().toString(36).substring(7);

    const prompt = `<|im_start|>system
You are a data cleaning expert. Clean the business data while preserving its core identity and type.
Request ID: ${requestId} // Force uniqueness
IMPORTANT: Return ONLY plain text without emojis or special characters.
<|im_end|>
<|im_start|>user
Clean this ${businessType} business data by following these rules exactly:
Input Business:
${JSON.stringify(business, null, 2)}
Cleaning Rules:
1. NAME: Remove brackets/braces but preserve core business identity
2. ADDRESS: Format as "street, city, state zip" using state abbreviations
3. PHONE: Extract and format primary phone as "(XXX) XXX-XXXX"
4. EMAIL: Remove markdown/mailto formatting but keep actual email
5. DESCRIPTION: Keep core business info but remove:
- ALL emojis and special characters (return plain text only)
- Prices and special offers
- Contact information
- Marketing language
- Social media elements
Return ONLY clean JSON with the original business identity preserved:
{
"business_info": {
"name": "Keep original business name without formatting",
"address": "Keep original address, properly formatted",
"phone": "Keep original phone number, properly formatted",
"email": "Keep original email without formatting",
"description": "Keep original business description without marketing"
}
}
<|im_end|>`;

    const response = await this.chat([{
      role: 'user',
      content: prompt
    }]);

    try {
      const jsonMatch = response.match(/\{[\s\S]*?\}\s*$/);
      if (!jsonMatch) {
        throw new Error('No JSON found in response');
      }

      const sanitizedJson = this.sanitizeJsonResponse(jsonMatch[0]);
      const parsed = JSON.parse(sanitizedJson);
      const cleaned = {
        ...business,
        ...parsed.business_info
      };

      // Validate and handle type mismatches more strictly
      const validationIssues = this.validateCleanedData(cleaned, business);

      if (validationIssues.length > 0) {
        console.log(`Attempt ${attempt + 1}: Validation issues:`, validationIssues.join(', '));

        // If there's a business type mismatch, go straight to manual cleaning
        if (validationIssues.some(issue => issue.includes('Business type mismatch'))) {
          console.log('Business type mismatch detected, applying manual cleaning...');
          return {
            ...business,
            ...this.manualClean(business)
          };
        }

        // For other validation issues, try again
        return this.cleanBusinessData(cleaned, attempt + 1);
      }

      return cleaned;
    } catch (error) {
      console.error('Failed to parse response:', error);
      console.log('Raw response:', response);

      // Try a line-based "field: value" parse of the raw response. It must see
      // the original line breaks: sanitizing the whole response first flattens
      // it to one line, leaving at most one field to recover.
      try {
        const fallback = this.parseResponse(response);
        return this.cleanBusinessData({ ...business, ...fallback }, attempt + 1);
      } catch (parseError) {
        console.error('Failed to parse sanitized response:', parseError);
        return this.cleanBusinessData({ ...business, ...this.manualClean(business) }, attempt + 1);
      }
    }
  }

  /**
   * Checks a cleaned record against the formatting rules given to the model.
   *
   * Side effect: when the address is malformed but cleanAddress can repair it,
   * `business.address` is overwritten in place and no issue is reported.
   *
   * @param business         Cleaned candidate record.
   * @param originalBusiness Record as it was before this cleaning attempt.
   * @returns Human-readable issues; empty when the record is acceptable.
   */
  private static validateCleanedData(business: Partial<Business>, originalBusiness: Business): string[] {
    const issues: string[] = [];

    // Stricter business type validation
    const originalType = this.detectBusinessType(originalBusiness.name || '');
    const cleanedType = this.detectBusinessType(business.name || '');

    if (originalType !== 'business') {
      if (cleanedType !== originalType) {
        issues.push(`Business type mismatch: expected ${originalType}, got ${cleanedType}`);
      }

      // Verify core identity is preserved
      const originalKeywords = originalBusiness.name?.toLowerCase().split(/\W+/).filter(Boolean) || [];
      const cleanedKeywords = business.name?.toLowerCase().split(/\W+/).filter(Boolean) || [];

      const significantKeywords = originalKeywords.filter(word =>
        !['the', 'and', 'llc', 'inc', 'corp', 'ltd', 'dba', 'est'].includes(word)
      );

      const missingKeywords = significantKeywords.filter(word =>
        !cleanedKeywords.some(cleaned => cleaned.includes(word))
      );

      if (missingKeywords.length > 0) {
        issues.push(`Core business identity lost: missing ${missingKeywords.join(', ')}`);
      }
    }

    if (business.name?.includes('[') || business.name?.includes(']')) {
      issues.push('Name contains brackets');
    }

    if (!business.address?.match(/^\d+[^,]+,\s*[^,]+,\s*[A-Z]{2}\s+\d{5}$/)) {
      const cleanedAddress = this.cleanAddress(business.address || '');
      if (cleanedAddress) {
        business.address = cleanedAddress;
      } else {
        issues.push('Address format incorrect');
      }
    }

    if (!business.phone?.match(/^\(\d{3}\) \d{3}-\d{4}$/)) {
      issues.push('Phone format incorrect');
    }

    if (business.email?.includes('[') || business.email?.includes('mailto:')) {
      issues.push('Email contains markdown/mailto');
    }

    if (business.description?.match(/\$|\b(?:call|email|visit|contact)\b/i)) {
      issues.push('Description contains pricing or contact info');
    }

    return issues;
  }

  /**
   * Sends a prompt to the Ollama generate endpoint and returns the completion.
   *
   * Public because CleanupService calls `DeepSeekService.chat(...)` directly.
   * Only the first message's content is sent; further messages are ignored.
   *
   * @param messages Chat-style messages; `messages[0].content` is the prompt.
   * @returns The model's text response.
   * @throws When the server is unreachable, the model is missing, or the reply
   *         carries no text — after retryWithBackoff has exhausted its attempts.
   */
  static async chat(messages: { role: string, content: string }[]): Promise<string> {
    return this.retryWithBackoff<string>(async () => {
      try {
        const response = await axios.post(
          this.OLLAMA_URL,
          {
            model: this.MODEL_NAME,
            prompt: messages[0].content,
            stream: false,
            options: {
              temperature: 0.7, // Add some randomness
              num_predict: 2048,
              stop: ["<|im_end|>", "\n\n"],
              top_k: 40, // Allow more variety
              top_p: 0.9, // Allow more variety
              seed: Date.now(), // Force different results each time
              reset: true // Reset context window
            }
          },
          {
            headers: {
              'Content-Type': 'application/json'
            },
            timeout: 30000
          }
        );

        // Callers immediately run string methods on the result, so reject a
        // reply without text here instead of failing later with a TypeError.
        const text = response.data?.response;
        if (typeof text !== 'string') {
          throw new Error('No response from Ollama');
        }
        return text;
      } catch (error) {
        if (axios.isAxiosError(error)) {
          if (error.code === 'ECONNREFUSED') {
            throw new Error('Ollama server not running');
          }
          if (error.response?.status === 404) {
            throw new Error(`Model ${this.MODEL_NAME} not found. Run: ollama pull ${this.MODEL_NAME}`);
          }
        }
        throw error;
      }
    });
  }

  /**
   * Line-based fallback parser for "field: value" model output.
   *
   * Recognises name, address, phone, email and description (case-insensitive).
   * Surrounding quotes, braces and trailing commas are tolerated, so lines of
   * almost-valid JSON such as `"phone": "(720) 379-6281",` are understood too.
   *
   * @param response Raw, multi-line model output.
   * @returns The fields that were found; other lines are ignored.
   */
  private static parseResponse(response: string) {
    const cleaned: Partial<Business> = {};

    for (const line of response.split(/\r?\n/)) {
      const [field, ...values] = line.split(':');

      const key = field
        .trim()
        .replace(/^[{\s"']+|["'\s]+$/g, '')
        .toLowerCase();

      // Re-join on ':' so values that contain colons (URLs, mailto:) survive.
      const value = this.sanitizeJsonResponse(values.join(':'))
        .replace(/,$/, '')
        .trim()
        .replace(/^["']|["']$/g, '')
        .trim();

      switch (key) {
        case 'name':
          cleaned.name = value;
          break;
        case 'address':
          cleaned.address = value;
          break;
        case 'phone':
          cleaned.phone = value;
          break;
        case 'email':
          cleaned.email = value;
          break;
        case 'description':
          cleaned.description = value;
          break;
      }
    }

    return cleaned;
  }
}

View file

@ -0,0 +1,63 @@
import axios from 'axios';
import { sleep } from '../utils/helpers';
/** A single geocoding match, as produced by GeocodingService.geocode. */
interface GeocodingResult {
// Latitude, parsed from the provider's `lat` field.
lat: number;
// Longitude, parsed from the provider's `lon` field.
lng: number;
// The provider's `display_name` for the match.
formattedAddress: string;
}
/**
 * Address-to-coordinates lookup backed by the public Nominatim search API,
 * with an in-memory result cache and a global one-request-per-second limit.
 */
export class GeocodingService {
  private static cache = new Map<string, GeocodingResult>();
  // Time (ms since epoch) at which the most recently scheduled request runs.
  private static lastRequestTime = 0;
  private static RATE_LIMIT_MS = 1000; // 1 second between requests (Nominatim requirement)
  private static REQUEST_TIMEOUT_MS = 10000; // Do not let a stalled request hang the caller

  /**
   * Geocodes a free-form address.
   *
   * @param address Address text to look up.
   * @returns The best match, or null when the address is empty, nothing was
   *          found, the match has no usable coordinates, or the request
   *          failed (failures are logged, never thrown).
   */
  static async geocode(address: string): Promise<GeocodingResult | null> {
    const query = address?.trim();
    if (!query) return null;

    // Check cache first
    const cached = this.cache.get(query);
    if (cached) return cached;

    try {
      // Rate limiting. The slot is reserved synchronously, before any await,
      // so concurrent callers each get their own slot one interval apart.
      // Checking the timestamp and updating it only after sleeping let every
      // concurrent caller pass the check and fire at the same moment.
      const now = Date.now();
      const slot = Math.max(now, this.lastRequestTime + this.RATE_LIMIT_MS);
      this.lastRequestTime = slot;
      if (slot > now) {
        await sleep(slot - now);
      }

      const response = await axios.get(
        'https://nominatim.openstreetmap.org/search',
        {
          params: {
            q: query,
            format: 'json',
            limit: 1,
            addressdetails: 1
          },
          headers: {
            'User-Agent': 'BusinessFinder/1.0'
          },
          timeout: this.REQUEST_TIMEOUT_MS
        }
      );

      if (!Array.isArray(response.data) || response.data.length === 0) {
        return null;
      }

      const result = response.data[0];
      const geocoded: GeocodingResult = {
        lat: parseFloat(result.lat),
        lng: parseFloat(result.lon),
        formattedAddress: result.display_name
      };

      // Never hand out (or cache) NaN coordinates from a malformed match.
      if (!Number.isFinite(geocoded.lat) || !Number.isFinite(geocoded.lng)) {
        return null;
      }

      // Cache the result
      this.cache.set(query, geocoded);
      return geocoded;
    } catch (error) {
      console.error('Geocoding error:', error);
      return null;
    }
  }
}

View file

@ -1,35 +1,44 @@
import axios from 'axios'; import axios from 'axios';
import { env } from '../../config/env'; import { env } from '../../config/env';
interface OllamaResponse {
response: string;
context?: number[];
}
export class OllamaService { export class OllamaService {
private url: string; private static readonly baseUrl = env.ollama.url;
private model: string; private static readonly model = env.ollama.model;
constructor() { static async complete(prompt: string): Promise<string> {
this.url = env.ollama.url;
this.model = env.ollama.model;
}
async complete(prompt: string): Promise<string> {
try { try {
const response = await axios.post(`${this.url}/api/generate`, { const response = await axios.post(`${this.baseUrl}/api/generate`, {
model: this.model, model: this.model,
prompt: prompt, prompt: prompt,
stream: false, stream: false
options: {
temperature: 0.7,
top_p: 0.9
}
}); });
if (response.data?.response) {
return response.data.response; return response.data.response;
}
throw new Error('No response from Ollama');
} catch (error) { } catch (error) {
console.error('Ollama completion failed:', error); console.error('Ollama error:', error);
throw error;
}
}
static async chat(messages: { role: 'user' | 'assistant'; content: string }[]): Promise<string> {
try {
const response = await axios.post(`${this.baseUrl}/api/chat`, {
model: this.model,
messages: messages,
stream: false
});
if (response.data?.message?.content) {
return response.data.message.content;
}
throw new Error('No response from Ollama chat');
} catch (error) {
console.error('Ollama chat error:', error);
throw error; throw error;
} }
} }

View file

@ -0,0 +1,97 @@
import { DeepSeekService } from './deepseekService';
import { createClient } from '@supabase/supabase-js';
import { Business } from '../types';
/**
 * Business search with a Supabase-backed result cache.
 *
 * Results are cached per (query, location) pair in the `cache` table and
 * reused until their `expires_at` timestamp has passed.
 */
export class SearchService {
  private supabase;
  private deepseek;

  /** Creates the Supabase client from SUPABASE_URL / SUPABASE_KEY. */
  constructor() {
    this.supabase = createClient(
      process.env.SUPABASE_URL!,
      process.env.SUPABASE_KEY!
    );
    this.deepseek = DeepSeekService;
  }

  /**
   * Searches for businesses matching `query` near `location`.
   *
   * @returns Cached results when a non-expired cache entry exists, otherwise
   *          fresh results (which are then cached).
   * @throws Error('Query and location are required') when either is empty.
   * @throws Error('Rate limit exceeded') when the search is answered with 429.
   */
  async search(query: string, location: string): Promise<Business[]> {
    if (!query || !location) {
      throw new Error('Query and location are required');
    }

    // Check cache first
    const cacheKey = `${query}_${location}`.toLowerCase();
    const { data: cacheData } = await this.supabase
      .from('cache')
      .select()
      .eq('key', cacheKey)
      .single();

    // Honour the expiry written by cacheResults; without this check an entry
    // was served forever once written.
    if (cacheData && cacheData.value && !SearchService.isExpired(cacheData.expires_at)) {
      return cacheData.value as Business[];
    }

    try {
      // Perform search
      const searchResults = await this.performSearch(query, location);

      // Cache results
      await this.cacheResults(cacheKey, searchResults);

      return searchResults;
    } catch (error: any) {
      if (error.response?.status === 429) {
        throw new Error('Rate limit exceeded');
      }
      throw error;
    }
  }

  /**
   * Loads one business by primary key.
   *
   * @returns The business, or null when it does not exist or the query failed.
   */
  async getBusinessById(id: string): Promise<Business | null> {
    const { data, error } = await this.supabase
      .from('businesses')
      .select()
      .eq('id', id)
      .single();

    if (error || !data) {
      return null;
    }

    return data as Business;
  }

  /**
   * True when `expiresAt` is a parseable timestamp that is not in the future.
   * A missing or unparseable value counts as "not expired", so rows written
   * without an expiry keep behaving as they did before expiry was checked.
   */
  private static isExpired(expiresAt: unknown): boolean {
    if (typeof expiresAt !== 'string' || !expiresAt) {
      return false;
    }
    const expiry = Date.parse(expiresAt);
    return Number.isFinite(expiry) && expiry <= Date.now();
  }

  private async performSearch(query: string, location: string): Promise<Business[]> {
    // Implementation would use DeepSeek service to perform search
    // This is a placeholder implementation
    const mockBusiness: Business = {
      id: 'test_1',
      name: "Denver's Best Plumbing",
      address: "1234 Main Street, Denver, CO 80202",
      phone: "(720) 555-1234",
      email: "support@denverplumbing.com",
      description: "Professional plumbing services",
      source: 'test',
      website: 'https://example.com',
      rating: 4.8,
      location: { lat: 39.7392, lng: -104.9903 },
      openingHours: []
    };

    return [mockBusiness];
  }

  /**
   * Stores `results` under `key`, valid for CACHE_DURATION_DAYS (default 7).
   *
   * Best-effort: a failed cache write is logged and must not fail a search
   * that already produced results.
   */
  private async cacheResults(key: string, results: Business[]): Promise<void> {
    try {
      // A missing, non-numeric or non-positive setting falls back to 7 days;
      // NaN would otherwise yield an invalid Date and make toISOString() throw.
      const configuredDays = Number(process.env.CACHE_DURATION_DAYS || 7);
      const days = Number.isFinite(configuredDays) && configuredDays > 0 ? configuredDays : 7;

      const expiresAt = new Date();
      expiresAt.setDate(expiresAt.getDate() + days);

      // Upsert so an expired entry for the same key is replaced; a plain
      // insert cannot refresh it. Assumes `key` has a unique constraint, as
      // the `.single()` lookup in search() already implies — confirm against
      // the table schema.
      const { error } = await this.supabase
        .from('cache')
        .upsert([{
          key,
          value: results,
          created_at: new Date().toISOString(),
          expires_at: expiresAt.toISOString()
        }], { onConflict: 'key' });

      if (error) {
        console.error('Failed to cache search results:', error);
      }
    } catch (error) {
      console.error('Failed to cache search results:', error);
    }
  }
}

View file

@ -1,5 +1,5 @@
export interface BusinessData { export interface Business {
id?: string; id: string;
name: string; name: string;
phone?: string; phone?: string;
email?: string; email?: string;
@ -7,22 +7,16 @@ export interface BusinessData {
rating?: number; rating?: number;
website?: string; website?: string;
logo?: string; logo?: string;
source?: string; source: string;
description?: string; description?: string;
location?: { location?: {
lat: number; lat: number;
lng: number; lng: number;
}; };
latitude?: number;
longitude?: number;
place_id?: string;
photos?: string[];
openingHours?: string[]; openingHours?: string[];
distance?: { services?: string[];
value: number; reviewCount?: number;
unit: string; hours?: string[];
};
last_updated?: string;
search_count?: number;
created_at?: string;
} }
export type BusinessData = Business;

View file

@ -1,3 +1,5 @@
import { Business } from '../types';
export function normalizePhoneNumber(phone: string): string { export function normalizePhoneNumber(phone: string): string {
return phone.replace(/[^\d]/g, ''); return phone.replace(/[^\d]/g, '');
} }
@ -22,9 +24,44 @@ export function calculateReliabilityScore(business: Business): number {
if (business.phone) score += 2; if (business.phone) score += 2;
if (business.website) score += 1; if (business.website) score += 1;
if (business.email) score += 1; if (business.email) score += 1;
if (business.hours) score += 2; if (business.hours?.length) score += 2;
if (business.services.length > 0) score += 1; if (business.services && business.services.length > 0) score += 1;
if (business.reviewCount > 10) score += 2; if (business.reviewCount && business.reviewCount > 10) score += 2;
return score; return score;
} }
/**
 * Strips a leading LLM-style preamble ("Sure!", "Here is ", "The business
 * address is:" ...) from an address, flattens newlines to spaces and trims.
 */
export function cleanAddress(address: string): string {
  const leadingPreamble = /^(Sure!|Here is |The business address( is| found in the text is)?:?\n?\s*)/i;
  const withoutPreamble = address.replace(leadingPreamble, '');
  const singleLine = withoutPreamble.replace(/\n/g, ' ');
  return singleLine.trim();
}
/**
 * Formats a US phone number as "(XXX) XXX-XXXX".
 *
 * All non-digits are ignored. A leading country code "1" on an 11-digit number
 * is dropped before formatting. Any input that does not reduce to exactly ten
 * digits is returned unchanged.
 */
export function formatPhoneNumber(phone: string): string {
  // Remove all non-numeric characters
  const digits = phone.replace(/\D/g, '');

  // "+1 720-555-1234" / "1 (720) 555-1234" carry the US country code.
  const national = digits.length === 11 && digits.startsWith('1')
    ? digits.slice(1)
    : digits;

  if (national.length === 10) {
    return `(${national.slice(0, 3)}) ${national.slice(3, 6)}-${national.slice(6)}`;
  }

  // Return original if it is not a 10-digit number
  return phone;
}
/**
 * Removes the first "NNN-NNNN" fragment and the first run of ten digits from
 * an email string (phone-number residue), then trims surrounding whitespace.
 */
export function cleanEmail(email: string): string {
  const withoutLocalNumber = email.replace(/\d{3}-\d{4}/, '');
  const withoutTenDigitRun = withoutLocalNumber.replace(/\d{10}/, '');
  return withoutTenDigitRun.trim();
}
/**
 * Drops a leading "Description:", "About:" or "Info:" label, collapses every
 * whitespace run to a single space and trims the result.
 */
export function cleanDescription(description: string): string {
  const unlabelled = description.replace(/^(Description:|About:|Info:)/i, '');
  const collapsed = unlabelled.replace(/\s+/g, ' ');
  return collapsed.trim();
}

18
src/lib/utils/helpers.ts Normal file
View file

@ -0,0 +1,18 @@
/** Resolves after roughly `ms` milliseconds (scheduled with setTimeout). */
export function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Normalises free text: removes every character that is not a word character,
 * whitespace, hyphen, period or comma, then collapses whitespace runs to one
 * space and trims.
 *
 * Symbols are stripped before whitespace is collapsed so that removing a
 * symbol surrounded by spaces ("a @ b") does not leave a double space behind.
 */
export function cleanText(text: string): string {
  return text
    .replace(/[^\w\s.,-]/g, '')
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Loose phone-number check: an optional leading "+" followed by at least ten
 * characters drawn from digits, "-", ".", "(", ")" and whitespace.
 *
 * The string must also contain at least seven actual digits, so punctuation-only
 * input such as "----------" is rejected.
 */
export function isValidPhone(phone: string): boolean {
  const MIN_DIGITS = 7;
  if (!/^\+?[\d\-.()\s]{10,}$/.test(phone)) {
    return false;
  }
  const digitCount = phone.replace(/\D/g, '').length;
  return digitCount >= MIN_DIGITS;
}
/**
 * Minimal shape check for an email address: non-empty local part, "@", and a
 * domain containing a dot, with no whitespace or extra "@" anywhere.
 */
export function isValidEmail(email: string): boolean {
  const emailShape = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
  return emailShape.test(email);
}

168
src/lib/utils/scraper.ts Normal file
View file

@ -0,0 +1,168 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { OllamaService } from '../services/ollamaService';
import { sleep } from './helpers';
const RATE_LIMIT_MS = 1000; // 1 second between requests
// Time (ms since epoch) at which the most recently scheduled request is sent.
let lastRequestTime = 0;

/**
 * GETs `url` with a 5 s timeout and bot-identifying headers, spacing requests
 * issued through this function at least RATE_LIMIT_MS apart.
 *
 * The send slot is reserved synchronously, before the first `await`, so that
 * several calls started together (e.g. via Promise.all) queue one after
 * another instead of all observing the same stale timestamp and firing at once.
 */
async function rateLimitedRequest(url: string) {
  const now = Date.now();
  const scheduledAt = Math.max(now, lastRequestTime + RATE_LIMIT_MS);
  lastRequestTime = scheduledAt;

  const waitMs = scheduledAt - now;
  if (waitMs > 0) {
    await sleep(waitMs);
  }

  return axios.get(url, {
    timeout: 5000,
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; BusinessFinder/1.0; +http://example.com/bot)',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
      'Accept-Language': 'en-US,en;q=0.5'
    }
  });
}
/**
 * Contact details scraped from a business web page. Every field is optional;
 * extractContactFromHtml returns an empty object when scraping fails.
 */
export interface ContactInfo {
// Phone number as found on the page (not normalised here)
phone?: string;
// Email address as found on the page
email?: string;
// Street address text
address?: string;
// Business description (structured data or the meta description tag)
description?: string;
// Opening-hours strings, one entry per text fragment found
openingHours?: string[];
}
/**
 * Fetches `url` (rate limited) and extracts contact details.
 *
 * JSON-LD structured data is preferred: the first node typed `LocalBusiness`
 * or `Restaurant` supplies all fields. Otherwise the page text and common
 * selectors are scanned. Any failure is logged and yields an empty object.
 */
export async function extractContactFromHtml(url: string): Promise<ContactInfo> {
  try {
    const response = await rateLimitedRequest(url);
    const $ = cheerio.load(response.data);

    // Parse every JSON-LD script; unparsable or non-object payloads are dropped.
    const parsedBlocks = $('script[type="application/ld+json"]')
      .map((_, el) => {
        try {
          return JSON.parse($(el).html() || '');
        } catch {
          return null;
        }
      })
      .get()
      .filter((entry) => entry !== null && typeof entry === 'object');

    // Many sites wrap their nodes in a top-level "@graph" array; include those.
    const structuredData: any[] = [];
    for (const block of parsedBlocks) {
      structuredData.push(block);
      if (Array.isArray(block['@graph'])) {
        structuredData.push(...block['@graph'].filter(Boolean));
      }
    }

    // Look for LocalBusiness or Restaurant schema ("@type" may be a string or a list)
    const wantedTypes = ['LocalBusiness', 'Restaurant'];
    const businessData = structuredData.find((data) => {
      const declared = Array.isArray(data['@type']) ? data['@type'] : [data['@type']];
      return declared.some((type: unknown) => wantedTypes.includes(type as string));
    });

    if (businessData) {
      const { address, openingHours } = businessData;
      return {
        phone: businessData.telephone,
        email: businessData.email,
        // schema.org allows address to be plain text or a PostalAddress object
        address: typeof address === 'string' ? address : address?.streetAddress,
        description: businessData.description,
        // openingHours may be a single string; ContactInfo expects a list
        openingHours: typeof openingHours === 'string' ? [openingHours] : openingHours
      };
    }

    // Fallback to regular HTML parsing
    return {
      phone: findPhone($),
      email: findEmail($),
      address: findAddress($),
      description: $('meta[name="description"]').attr('content'),
      openingHours: findOpeningHours($)
    };
  } catch (error) {
    console.warn(`Error extracting contact info from ${url}:`, error);
    return {};
  }
}
/**
 * Asks the Ollama model to pull a business address near `location` out of
 * free text. Returns the trimmed model reply, or '' if the call fails.
 */
export async function extractCleanAddress(text: string, location: string): Promise<string> {
  try {
    // The completion API is static, so no OllamaService instance is created.
    const prompt = [
      '',
      `Extract a business address from this text. The business should be in or near ${location}.`,
      'Only return the address, nothing else. If no valid address is found, return an empty string.',
      `Text: ${text}`,
      ''
    ].join('\n');

    const response = await OllamaService.complete(prompt);
    return response.trim();
  } catch (error) {
    console.warn('Error extracting address:', error);
    return '';
  }
}
// Helper functions
/**
 * Finds the first phone number in the page text.
 *
 * Tries a bare 10-digit US number first (area code optionally in parentheses),
 * then a number introduced by a "Phone:", "Tel:" or "Contact:" label.
 * Returns undefined when neither is present.
 */
function findPhone($: cheerio.CheerioAPI): string | undefined {
  // Extract the page text once rather than once per pattern.
  const pageText = $.text();

  // The parenthesised area code is its own alternative: a leading \b cannot
  // match before "(", which used to produce results like "720) 555-1234".
  const bareNumber = pageText.match(/(?:\(\d{3}\)|\b\d{3})[-. ]?\d{3}[-. ]?\d{4}\b/);
  if (bareNumber) {
    return bareNumber[0];
  }

  const labelled = pageText.match(/\b(?:Phone|Tel|Contact):\s*([0-9-().+ ]{10,})\b/i);
  if (labelled) {
    // Return only the number, not the "Phone:" label in front of it.
    return labelled[1].trim();
  }

  return undefined;
}
/** Returns the first email-looking token in the page text, if any. */
function findEmail($: cheerio.CheerioAPI): string | undefined {
  // The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal "|".
  const emailPattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
  const match = $.text().match(emailPattern);
  return match ? match[0] : undefined;
}
/**
 * Returns the trimmed text of the first address-like element that actually
 * contains text, trying selectors from most to least specific.
 */
function findAddress($: cheerio.CheerioAPI): string | undefined {
  // Look for address in common elements
  const addressSelectors = [
    'address',
    '[itemtype="http://schema.org/PostalAddress"]',
    '.address',
    '#address',
    '[class*="address"]',
    '[id*="address"]'
  ];

  for (const selector of addressSelectors) {
    const text = $(selector).first().text().trim();
    // An empty match must not stop the search: fall through to the next selector.
    if (text) {
      return text;
    }
  }
  return undefined;
}
/**
 * Collects distinct, non-empty text fragments from the first element matched
 * by each opening-hours selector, in document order.
 */
function findOpeningHours($: cheerio.CheerioAPI): string[] {
  // A Set keeps insertion order and makes de-duplication O(1) per entry.
  const hours = new Set<string>();
  const hoursSelectors = [
    '[itemtype="http://schema.org/OpeningHoursSpecification"]',
    '.hours',
    '#hours',
    '[class*="hours"]',
    '[id*="hours"]'
  ];

  for (const selector of hoursSelectors) {
    const element = $(selector).first();
    if (!element.length) {
      continue;
    }

    // When the container has no child elements its hours are plain text on the
    // element itself; read that instead of returning nothing.
    const descendants = element.find('*');
    const textSources = descendants.length ? descendants : element;

    textSources.each((_, el) => {
      const text = $(el).text().trim();
      if (text) {
        hours.add(text);
      }
    });
  }

  return Array.from(hours);
}

View file

@ -1,87 +1,60 @@
import { Router } from 'express'; import express from 'express';
import { searchBusinesses } from '../lib/searxng'; import { SearchService } from '../lib/services/searchService';
import { categories } from '../lib/categories';
import { supabase } from '../lib/supabase';
import { BusinessData } from '../lib/types';
const router = Router(); const router = express.Router();
const searchService = new SearchService();
// Categories endpoint // Error handling middleware for JSON parsing errors
router.get('/categories', (req, res) => { router.use((err: any, req: express.Request, res: express.Response, next: express.NextFunction) => {
res.json(categories); if (err instanceof SyntaxError && 'body' in err) {
return res.status(400).json({ error: 'Invalid JSON' });
}
next();
}); });
// Search endpoint // Search endpoint
router.get('/search', async (req, res) => { router.post('/search', async (req, res) => {
try { try {
const query = req.query.q as string; const { query, location } = req.body;
const [searchTerm, location] = query.split(' in ');
if (!query) { if (!query || !location) {
return res.status(400).json({ error: 'Search query is required' }); return res.status(400).json({
} error: 'Query and location are required'
// Set headers for streaming response
res.setHeader('Content-Type', 'application/json');
res.setHeader('Transfer-Encoding', 'chunked');
// First, search in Supabase
const { data: existingResults, error: dbError } = await supabase
.from('businesses')
.select('*')
.or(`name.ilike.%${searchTerm}%, description.ilike.%${searchTerm}%`)
.ilike('address', `%${location}%`);
if (dbError) {
console.error('Supabase search error:', dbError);
}
// Send existing results immediately if there are any
if (existingResults && existingResults.length > 0) {
const chunk = JSON.stringify({
source: 'database',
results: existingResults
}) + '\n';
res.write(chunk);
}
// Start background search
const searchPromise = searchBusinesses(query, {
onProgress: (status, progress) => {
const chunk = JSON.stringify({
source: 'search',
status,
progress,
}) + '\n';
res.write(chunk);
}
}); });
const results = await searchPromise;
// Send final results
const finalChunk = JSON.stringify({
source: 'search',
results,
complete: true
}) + '\n';
res.write(finalChunk);
res.end();
} catch (error: unknown) {
console.error('Search error:', error);
const errorResponse = {
error: 'An error occurred while searching',
details: error instanceof Error ? error.message : 'Unknown error'
};
// Only send error response if headers haven't been sent
if (!res.headersSent) {
res.status(500).json(errorResponse);
} else {
res.write(JSON.stringify(errorResponse));
res.end();
} }
const results = await searchService.search(query, location);
res.json({ results });
} catch (error: any) {
if (error.response?.status === 429) {
return res.status(429).json({
error: 'Rate limit exceeded'
});
}
res.status(500).json({
error: error.message || 'Internal server error'
});
}
});
// Get business details endpoint
router.get('/business/:id', async (req, res) => {
try {
const { id } = req.params;
const business = await searchService.getBusinessById(id);
if (!business) {
return res.status(404).json({
error: 'Business not found'
});
}
res.json(business);
} catch (error: any) {
res.status(500).json({
error: error.message || 'Internal server error'
});
} }
}); });

View file

@ -0,0 +1,139 @@
import { createClient } from '@supabase/supabase-js';
// Mock data type
// Maps each table name used in this suite to the shape of the single canned
// row that the mocked Supabase client returns for it.
type MockData = {
businesses: { id: string; name: string };
cache: { key: string; value: { test: boolean } };
};
// Mock Supabase client
// createClient() returns an object whose from(table) yields stubbed query
// builders. Each chain resolves to a fixed result for the requested table, so
// no real database is contacted.
jest.mock('@supabase/supabase-js', () => ({
createClient: jest.fn(() => ({
from: jest.fn((table: keyof MockData) => {
// Canned row per table
const mockData: MockData = {
businesses: { id: 'test_1', name: 'Test Business' },
cache: { key: 'test_key', value: { test: true } }
};
return {
// insert(...).select() resolves to a one-element array holding the canned row
insert: jest.fn(() => ({
select: jest.fn().mockResolvedValue({
data: [mockData[table]],
error: null
})
})),
// select().eq(...).single() resolves to the canned row;
// select().eq(...).gt(...).single() resolves to no data
select: jest.fn(() => ({
eq: jest.fn(() => ({
single: jest.fn().mockResolvedValue({
data: mockData[table],
error: null
}),
gt: jest.fn(() => ({
single: jest.fn().mockResolvedValue({
data: null,
error: null
})
}))
}))
})),
// update(...).eq(...) resolves without error
update: jest.fn(() => ({
eq: jest.fn().mockResolvedValue({
error: null
})
})),
// delete().eq(...) resolves without error
delete: jest.fn(() => ({
eq: jest.fn().mockResolvedValue({
error: null
})
}))
};
})
}))
}));
// Exercises insert/select/update call chains against the mocked Supabase
// client defined above. The client is fully stubbed, so these tests check the
// shape of the calls and canned responses, not real persistence.
describe('Database Operations', () => {
const supabase = createClient('test-url', 'test-key');
// Sample row passed to the stubbed client; the mock returns its own canned
// row regardless of these values.
const testBusiness = {
id: `test_${Date.now()}`,
name: 'Test Business',
phone: '(303) 555-1234',
email: 'test@example.com',
address: '123 Test St, Denver, CO 80202',
rating: 5,
website: 'https://test.com',
source: 'test',
description: 'Test description',
location: { lat: 39.7392, lng: -104.9903 },
search_count: 1,
created_at: new Date().toISOString()
};
beforeEach(() => {
jest.clearAllMocks();
});
describe('Business Operations', () => {
it('should insert a business successfully', async () => {
const { data, error } = await supabase
.from('businesses')
.insert([testBusiness])
.select();
expect(error).toBeNull();
expect(data).toBeTruthy();
expect(data![0].name).toBe('Test Business');
});
it('should retrieve a business by id', async () => {
const { data, error } = await supabase
.from('businesses')
.select()
.eq('id', testBusiness.id)
.single();
expect(error).toBeNull();
expect(data).toBeTruthy();
expect(data.name).toBe('Test Business');
});
it('should update a business', async () => {
const { error } = await supabase
.from('businesses')
.update({ name: 'Updated Test Business' })
.eq('id', testBusiness.id);
expect(error).toBeNull();
});
});
describe('Cache Operations', () => {
// Cache row with a one-hour expiry (3600000 ms)
const testCache = {
key: `test_key_${Date.now()}`,
value: { test: true },
created_at: new Date().toISOString(),
expires_at: new Date(Date.now() + 3600000).toISOString()
};
it('should insert cache entry', async () => {
const { data, error } = await supabase
.from('cache')
.insert([testCache])
.select();
expect(error).toBeNull();
expect(data).toBeTruthy();
});
it('should retrieve cache entry', async () => {
const { data, error } = await supabase
.from('cache')
.select()
.eq('key', testCache.key)
.single();
expect(error).toBeNull();
expect(data.value).toEqual({ test: true });
});
});
});

View file

@ -0,0 +1,92 @@
import { DeepSeekService } from '../../lib/services/deepseekService';
import { Business } from '../../lib/types';
// Mock the DeepSeek service
// Every DeepSeekService method is replaced by a stub that returns the same
// already-clean business record, so no model is called.
jest.mock('../../lib/services/deepseekService', () => {
// Canned "cleaned" business returned by the stubs below
const mockCleanedBusiness = {
name: "Denver's Best Plumbing & Repair",
address: "1234 Main Street, Denver, CO 80202",
phone: "(720) 555-1234",
email: "support@denverplumbing.com",
description: "Professional plumbing services in Denver metro area"
};
return {
DeepSeekService: {
// chat resolves to a JSON string wrapping the canned record
chat: jest.fn().mockResolvedValue(JSON.stringify({
business_info: mockCleanedBusiness
})),
detectBusinessType: jest.fn().mockReturnValue('service'),
sanitizeJsonResponse: jest.fn().mockReturnValue(mockCleanedBusiness),
manualClean: jest.fn().mockReturnValue(mockCleanedBusiness),
cleanBusinessData: jest.fn().mockResolvedValue(mockCleanedBusiness)
}
};
});
// Checks the format of the cleaned fields returned by DeepSeekService. The
// service is mocked in this file, so these assertions validate the expected
// output contract against the canned record rather than a live model.
describe('DeepSeekService', () => {
  describe('cleanBusinessData', () => {
    // Deliberately noisy input: brackets, suite prefix, long-form state name.
    const testBusiness: Business = {
      id: 'test_1',
      name: "Denver's Best Plumbing & Repair [LLC] (A Family Business)",
      address: "Suite 200-B, 1234 Main Street, Denver, Colorado 80202",
      phone: "(720) 555-1234",
      email: "support@denverplumbing.com",
      description: "Professional plumbing services in Denver metro area",
      source: 'test',
      website: 'https://example.com',
      rating: 4.8,
      location: { lat: 39.7392, lng: -104.9903 },
      openingHours: []
    };

    beforeEach(() => {
      jest.clearAllMocks();
    });

    it('should clean business name correctly', async () => {
      const cleaned = await DeepSeekService.cleanBusinessData(testBusiness);
      expect(cleaned.name).not.toMatch(/[\[\]{}()]/);
      expect(cleaned.name).toBeTruthy();
    });

    it('should format phone number correctly', async () => {
      const cleaned = await DeepSeekService.cleanBusinessData(testBusiness);
      expect(cleaned.phone).toMatch(/^\(\d{3}\) \d{3}-\d{4}$/);
    });

    it('should clean email address', async () => {
      const cleaned = await DeepSeekService.cleanBusinessData(testBusiness);
      expect(cleaned.email).not.toMatch(/[\[\]<>()]|mailto:|click|schedule/i);
      expect(cleaned.email).toMatch(/^[^\s@]+@[^\s@]+\.[^\s@]+$/);
    });

    it('should clean description', async () => {
      const cleaned = await DeepSeekService.cleanBusinessData(testBusiness);
      expect(cleaned.description).not.toMatch(/[\$\d]+%?\s*off|\$/i);
      expect(cleaned.description).not.toMatch(/\b(?:call|email|visit|contact|text|www\.|http|@)\b/i);
      // The `u` flag makes the class match whole emoji code points; without it
      // the class holds lone surrogate halves and matches unrelated emoji.
      expect(cleaned.description).not.toMatch(/[📞📧🌐💳☎️📱]/u);
      expect(cleaned.description).not.toMatch(/#\w+/);
    });
  });

  describe('chat', () => {
    it('should return a response from the model', async () => {
      const response = await DeepSeekService['chat']([{
        role: 'user',
        content: 'Test message'
      }]);
      expect(response).toBeTruthy();
      expect(typeof response).toBe('string');
    });

    it('should handle errors gracefully', async () => {
      (DeepSeekService['chat'] as jest.Mock).mockRejectedValueOnce(new Error('Test error'));
      await expect(DeepSeekService['chat']([{
        role: 'user',
        content: 'Test message'
      }])).rejects.toThrow('Test error');
    });
  });
});

View file

@ -0,0 +1,145 @@
import express from 'express';
import request from 'supertest';
import { SearchService } from '../../../lib/services/searchService';
import { Business } from '../../../lib/types';
// Mock SearchService
jest.mock('../../../lib/services/searchService');
// HTTP-level tests for the /api router using supertest. SearchService is
// auto-mocked (see the jest.mock call above), so only routing, validation and
// error-to-status mapping are exercised.
describe('API Integration', () => {
let app: express.Application;
// Business returned by the stubbed SearchService methods
const mockBusiness: Business = {
id: 'test_1',
name: "Denver's Best Plumbing",
address: "1234 Main Street, Denver, CO 80202",
phone: "(720) 555-1234",
email: "support@denverplumbing.com",
description: "Professional plumbing services",
source: 'test',
website: 'https://example.com',
rating: 4.8,
location: { lat: 39.7392, lng: -104.9903 },
openingHours: []
};
beforeAll(() => {
app = express();
app.use(express.json());
// Mock SearchService methods
(SearchService.prototype.search as jest.Mock).mockResolvedValue([mockBusiness]);
(SearchService.prototype.getBusinessById as jest.Mock).mockResolvedValue(mockBusiness);
// Add error handling middleware
// Turns body-parser SyntaxErrors into a 400 response; other errors are passed on.
app.use((err: any, req: express.Request, res: express.Response, next: express.NextFunction) => {
if (err instanceof SyntaxError && 'body' in err) {
return res.status(400).json({ error: 'Invalid JSON' });
}
next(err);
});
// Add routes
// Required here (not imported at the top) so the router is loaded after the mocks are set.
app.use('/api', require('../../../routes/api').default);
});
beforeEach(() => {
jest.clearAllMocks();
});
describe('Search Endpoints', () => {
it('should handle search requests', async () => {
const response = await request(app)
.post('/api/search')
.send({
query: 'plumber in Denver',
location: 'Denver, CO'
});
expect(response.status).toBe(200);
expect(response.body).toHaveProperty('results');
expect(Array.isArray(response.body.results)).toBe(true);
expect(response.body.results[0]).toEqual(mockBusiness);
});
it('should handle missing parameters', async () => {
const response = await request(app)
.post('/api/search')
.send({
query: 'plumber in Denver'
// missing location
});
expect(response.status).toBe(400);
expect(response.body).toHaveProperty('error');
});
it('should handle search errors', async () => {
// Mock search error
(SearchService.prototype.search as jest.Mock)
.mockRejectedValueOnce(new Error('Search failed'));
const response = await request(app)
.post('/api/search')
.send({
query: 'plumber in Denver',
location: 'Denver, CO'
});
expect(response.status).toBe(500);
expect(response.body).toHaveProperty('error');
});
});
describe('Business Details Endpoint', () => {
it('should retrieve business details', async () => {
const response = await request(app)
.get('/api/business/test_1');
expect(response.status).toBe(200);
expect(response.body).toEqual(mockBusiness);
});
it('should handle non-existent business', async () => {
// Mock not found
(SearchService.prototype.getBusinessById as jest.Mock)
.mockResolvedValueOnce(null);
const response = await request(app)
.get('/api/business/non_existent');
expect(response.status).toBe(404);
expect(response.body).toHaveProperty('error');
});
});
describe('Error Handling', () => {
it('should handle invalid JSON', async () => {
// Body is sent as a raw string so express.json() fails to parse it
const response = await request(app)
.post('/api/search')
.set('Content-Type', 'application/json')
.send('{"invalid json"}');
expect(response.status).toBe(400);
expect(response.body).toHaveProperty('error');
expect(response.body.error).toBe('Invalid JSON');
});
it('should handle rate limiting', async () => {
// Mock rate limit error
// The rejection carries response.status 429, which the route checks for
(SearchService.prototype.search as jest.Mock)
.mockRejectedValueOnce({ response: { status: 429 } });
const response = await request(app)
.post('/api/search')
.send({
query: 'plumber in Denver',
location: 'Denver, CO'
});
expect(response.status).toBe(429);
expect(response.body).toHaveProperty('error');
expect(response.body.error).toBe('Rate limit exceeded');
});
});
});

View file

@ -0,0 +1,162 @@
import { DeepSeekService } from '../../../lib/services/deepseekService';
import { createClient } from '@supabase/supabase-js';
import { SearchService } from '../../../lib/services/searchService';
import { Business } from '../../../lib/types';
// Mock external services
jest.mock('@supabase/supabase-js');
jest.mock('../../../lib/services/deepseekService');
// Tests SearchService against a stubbed Supabase client: cache lookup, cache
// write, error mapping and read-back by id. `performSearch` is spied on per
// test so no real search runs.
describe('Search Integration', () => {
  const mockBusiness: Business = {
    id: 'test_1',
    name: "Denver's Best Plumbing",
    address: "1234 Main Street, Denver, CO 80202",
    phone: "(720) 555-1234",
    email: "support@denverplumbing.com",
    description: "Professional plumbing services",
    source: 'test',
    website: 'https://example.com',
    rating: 4.8,
    location: { lat: 39.7392, lng: -104.9903 },
    openingHours: []
  };

  // Mock Supabase responses: by default the cache lookup misses (data: null)
  // and inserts succeed. from() always returns the same builder object.
  const mockSupabase = {
    from: jest.fn().mockReturnValue({
      insert: jest.fn().mockReturnValue({
        select: jest.fn().mockResolvedValue({
          data: [mockBusiness],
          error: null
        })
      }),
      select: jest.fn().mockReturnValue({
        eq: jest.fn().mockReturnValue({
          single: jest.fn().mockResolvedValue({
            data: null,
            error: null
          })
        })
      })
    })
  };

  beforeEach(() => {
    jest.clearAllMocks();
    (createClient as jest.Mock).mockReturnValue(mockSupabase);
  });

  describe('Search and Store Flow', () => {
    it('should search, clean, and store business data', async () => {
      const searchService = new SearchService();
      const query = 'plumber in Denver';
      const location = 'Denver, CO';

      // Mock performSearch to return results
      const performSearchSpy = jest.spyOn(searchService as any, 'performSearch')
        .mockResolvedValue([mockBusiness]);

      // Perform search
      const results = await searchService.search(query, location);

      // Verify search results
      expect(results).toBeTruthy();
      expect(Array.isArray(results)).toBe(true);
      expect(results[0]).toEqual(mockBusiness);

      // Verify the search actually ran on the cache miss
      expect(performSearchSpy).toHaveBeenCalledWith(query, location);

      // Verify the cache table was consulted and the results were written to it
      expect(mockSupabase.from).toHaveBeenCalledWith('cache');
      expect(mockSupabase.from().insert).toHaveBeenCalled();
    });

    it('should handle search errors gracefully', async () => {
      const searchService = new SearchService();

      // Mock performSearch to throw error
      jest.spyOn(searchService as any, 'performSearch')
        .mockRejectedValue(new Error('Search failed'));

      await expect(searchService.search('invalid query', 'invalid location'))
        .rejects.toThrow('Search failed');
    });

    it('should use cache when available', async () => {
      const searchService = new SearchService();
      const query = 'plumber in Denver';
      const location = 'Denver, CO';

      // Install the spy BEFORE searching; a spy created afterwards has no
      // recorded calls and would make the assertion below pass vacuously.
      const performSearchSpy = jest.spyOn(searchService as any, 'performSearch');

      // Mock cache hit
      mockSupabase.from.mockReturnValueOnce({
        select: jest.fn().mockReturnValue({
          eq: jest.fn().mockReturnValue({
            single: jest.fn().mockResolvedValue({
              data: { value: [mockBusiness] },
              error: null
            })
          })
        })
      });

      const results = await searchService.search(query, location);

      // Verify cache was checked
      expect(mockSupabase.from).toHaveBeenCalledWith('cache');
      expect(results).toEqual([mockBusiness]);

      // Verify performSearch was not called
      expect(performSearchSpy).not.toHaveBeenCalled();
    });

    it('should handle rate limiting', async () => {
      const searchService = new SearchService();

      // Mock performSearch to throw rate limit error
      jest.spyOn(searchService as any, 'performSearch')
        .mockRejectedValue({ response: { status: 429 } });

      const query = 'plumber in Denver';
      const location = 'Denver, CO';

      await expect(searchService.search(query, location))
        .rejects.toThrow('Rate limit exceeded');
    });
  });

  describe('Data Consistency', () => {
    it('should maintain data consistency between search and retrieval', async () => {
      const searchService = new SearchService();
      const query = 'plumber in Denver';
      const location = 'Denver, CO';

      // Mock performSearch to return results
      jest.spyOn(searchService as any, 'performSearch')
        .mockResolvedValue([mockBusiness]);

      // Perform search
      const searchResults = await searchService.search(query, location);
      const firstResult = searchResults[0];

      // Mock database retrieval
      mockSupabase.from.mockReturnValueOnce({
        select: jest.fn().mockReturnValue({
          eq: jest.fn().mockReturnValue({
            single: jest.fn().mockResolvedValue({
              data: firstResult,
              error: null
            })
          })
        })
      });

      // Retrieve the same business
      const retrieved = await searchService.getBusinessById(firstResult.id);

      // Verify data consistency
      expect(retrieved).toEqual(firstResult);
    });
  });
});

22
src/tests/setup.ts Normal file
View file

@ -0,0 +1,22 @@
import dotenv from 'dotenv';
// Load environment variables for testing
// Values are read from .env.test in the working directory.
dotenv.config({ path: '.env.test' });
// Set default timeout for all tests
// 10000 ms per test
jest.setTimeout(10000);
// Global setup
// Placeholder hook: currently does nothing.
beforeAll(() => {
// Add any global setup here
});
// Global teardown
// Placeholder hook: currently does nothing.
afterAll(() => {
// Add any global cleanup here
});
// Reset mocks between tests
// clearAllMocks is called after every test.
afterEach(() => {
jest.clearAllMocks();
});

202
src/tests/testDeepseek.ts Normal file
View file

@ -0,0 +1,202 @@
import { DeepSeekService } from '../lib/services/deepseekService';
import { Business } from '../lib/types';
import axios from 'axios';
/**
 * Sends one chat message through DeepSeekService to confirm the model is
 * reachable. Logs the reply and returns true, or logs the failure (with a
 * hint when the connection is refused) and returns false.
 */
async function testOllamaConnection() {
  console.log('🔍 Testing Ollama connection...\n');

  try {
    // Test simple connection
    console.log('Testing Qwen model...');
    const response = await DeepSeekService['chat']([{
      role: 'user',
      content: 'Say "Hello, testing Qwen model!"'
    }]);
    console.log('✅ Model Response:', response);
    return true;
  } catch (error) {
    // Anything that is not an Error instance carries no usable details.
    if (!(error instanceof Error)) {
      console.error('❌ Connection test failed with unknown error');
      return false;
    }

    console.error('❌ Connection test failed:', error.message);

    if (axios.isAxiosError(error)) {
      const refused = error.code === 'ECONNREFUSED';
      if (refused) {
        console.error('❌ Make sure Ollama is running (ollama serve)');
      } else {
        console.error('API Error details:', error.response?.data);
      }
    }
    return false;
  }
}
/**
 * Runs DeepSeekService.cleanBusinessData over four deliberately noisy business
 * records, logs input, timing and output, and reports which cleaned fields
 * still violate the expected formats.
 */
async function testDataCleaning() {
  console.log('\n🧪 Testing business data cleaning...');

  // Template-literal descriptions keep their continuation lines at column 0 so
  // the test strings contain no leading indentation.
  const testCases: Business[] = [
    {
      id: 'test_1',
      name: "Denver's Best Plumbing & Repair [LLC] (A Family Business) {Est. 1995}",
      address: "CONTACT US TODAY! Suite 200-B, 1234 Main Street, Denver, Colorado 80202 (Near Starbucks)",
      phone: "☎️ Main: (720) 555-1234 | Emergency: 1-800-555-9999 | Text: 720.555.4321",
      email: "[support@denverplumbing.com](mailto:support@denverplumbing.com) or info@denverplumbing.com",
      description: `$$$ LIMITED TIME OFFER $$$
🚰 Professional plumbing services in Denver metro area
💰 20% OFF all repairs over $500!
Family owned since 1995
📞 Available 24/7 for emergencies
🌐 Visit www.denverplumbing.com
📧 Email us at contact@denverplumbing.com
💳 All major credit cards accepted
#DenverPlumbing #EmergencyService`,
      source: 'test',
      website: 'https://example.com',
      rating: 4.8,
      logo: 'logo.png',
      location: { lat: 39.7392, lng: -104.9903 },
      openingHours: []
    },
    {
      id: 'test_2',
      name: "[MIKE'S AUTO] {{CERTIFIED}} [BMW & AUDI SPECIALIST]",
      address: "GET DIRECTIONS: 5678 Auto Row Drive\nUnit C-123\nDenver, CO 80205\nBehind Home Depot",
      phone: "Sales: 303-555-0000\nService: (303) 555-1111\nFax: 303.555.2222",
      email: "appointments@mikesauto.com <click to email> [Schedule Now](https://booking.mikesauto.com)",
      description: `🚗 Denver's Premier Auto Service Center
💯 ASE Certified Mechanics
🔧 Specializing in German Luxury Vehicles
💰💰💰 Spring Special: Free oil change with any service over $300
Same-day service available
🎯 Located in central Denver
📱 Text "REPAIR" to 80205 for $50 off
Over 500 5-star reviews!`,
      source: 'test',
      website: 'https://mikesauto.com',
      rating: 4.9,
      logo: 'logo.png',
      location: { lat: 39.7599, lng: -104.9987 },
      openingHours: ['Mon-Fri 8-6', 'Sat 9-3']
    },
    {
      id: 'test_3',
      name: "🌟 SUNSHINE DENTAL & ORTHODONTICS, P.C. [Dr. Smith & Associates] (Voted #1)",
      address: "SCHEDULE TODAY!\n🦷 Building 3, Suite 300\n9876 Medical Plaza Way\nDENVER COLORADO, 80210\nNext to Target",
      phone: "📞 New Patients: 1 (720) 999-8888 | Existing: 720.999.7777 | After Hours: +1-720-999-6666",
      email: "appointments@sunshinedentalco.com, info@sunshinedentalco.com, emergency@sunshinedentalco.com",
      description: `✨ Your Premier Dental Care Provider in Denver! ✨
🦷 State-of-the-art facility
💎 Cosmetic & General Dentistry
👶 Family-friendly environment
💰 NEW PATIENT SPECIAL: $99 Cleaning & Exam (Reg. $299)
🏥 Most insurance accepted
1,000+ 5-star reviews on Google
🎁 Refer a friend and get $50 credit
📱 Download our app: smile.sunshinedentalco.com`,
      source: 'test',
      website: 'https://sunshinedentalco.com',
      rating: 5.0,
      logo: 'logo.png',
      location: { lat: 39.7120, lng: -104.9412 },
      openingHours: ['Mon-Thu 8-5', 'Fri 8-2', 'Sat By Appt']
    },
    {
      id: 'test_4',
      name: "THE COFFEE SPOT ☕️ {{NOW OPEN}} [Under New Management!]",
      address: "ORDER PICKUP:\nGround Floor\n4321 Downtown Street\nDenver, CO. 80203\nInside Union Station",
      phone: "☎️ Store: 303•777•5555\n💬 Text Orders: 303-777-4444",
      email: "<Order Online> orders@thecoffeespot.co [Click Here](https://order.thecoffeespot.co)",
      description: `☕️ Denver's Favorite Coffee Shop Since 2020!
🌱 Organic, Fair-Trade Coffee
🥐 Fresh-Baked Pastries Daily
MORNING RUSH SPECIAL: $2 off any drink before 9am!
🎯 Loyalty Program: Buy 9, Get 1 FREE
📱 Order ahead on our app
🎁 Student Discount: 10% off with ID
#CoffeeLovers #DenverCoffee #MorningFuel
Follow us @thecoffeespot for daily specials!`,
      source: 'test',
      website: 'https://thecoffeespot.co',
      rating: 4.7,
      logo: 'logo.png',
      location: { lat: 39.7508, lng: -104.9997 },
      openingHours: ['Mon-Fri 6-8', 'Sat-Sun 7-7']
    }
  ];

  for (const testCase of testCases) {
    console.log('\nTesting case:', testCase.id);
    console.log('Input data:', JSON.stringify(testCase, null, 2));

    console.time('Cleaning Duration');
    const cleaned = await DeepSeekService.cleanBusinessData(testCase);
    console.timeEnd('Cleaning Duration');

    console.log('\nCleaned data:', JSON.stringify(cleaned, null, 2));

    // Validate the results
    const validationIssues: string[] = [];

    // Name validation
    if (cleaned.name?.match(/[\[\]{}()]/)) {
      validationIssues.push('Name contains brackets/braces/parentheses');
    }

    // Address validation: "<number> <street>, <city>, <ST> <zip5>"
    if (!cleaned.address?.match(/^\d+[^,]+,\s*[^,]+,\s*[A-Z]{2}\s+\d{5}$/)) {
      validationIssues.push('Address format incorrect');
    }

    // Phone validation: "(XXX) XXX-XXXX"
    if (!cleaned.phone?.match(/^\(\d{3}\) \d{3}-\d{4}$/)) {
      validationIssues.push('Phone format incorrect');
    }

    // Email validation
    if (cleaned.email?.match(/[\[\]<>()]|mailto:|click|schedule/i)) {
      validationIssues.push('Email contains formatting/links');
    }

    // Description validation
    const descriptionIssues: string[] = [];
    if (cleaned.description?.match(/[\$\d]+%?\s*off|\$/i)) {
      descriptionIssues.push('contains pricing');
    }
    if (cleaned.description?.match(/\b(?:call|email|visit|contact|text|www\.|http|@)\b/i)) {
      descriptionIssues.push('contains contact info');
    }
    // `u` flag: match whole emoji code points. Without it the class is made of
    // lone surrogate halves and flags unrelated emoji that share one.
    if (cleaned.description?.match(/[📞📧🌐💳☎️📱]/u)) {
      descriptionIssues.push('contains emojis');
    }
    if (cleaned.description?.match(/#\w+/)) {
      descriptionIssues.push('contains hashtags');
    }
    if (descriptionIssues.length > 0) {
      validationIssues.push(`Description ${descriptionIssues.join(', ')}`);
    }

    if (validationIssues.length > 0) {
      console.log('\n⚠ Validation issues:', validationIssues.join(', '));
    } else {
      console.log('\n✅ All fields cleaned successfully');
    }
  }
}
/**
 * Drives the whole test run: checks the Ollama connection first and only
 * proceeds to the data-cleaning cases when that check reports success.
 *
 * @returns {Promise<void>} settles once the run has finished or been skipped.
 */
async function runTests() {
  console.log('🚀 Starting Qwen model tests...\n');

  const isConnected = await testOllamaConnection();
  if (isConnected) {
    // Connection is up, so the cleaning cases are worth running.
    await testDataCleaning();
    return;
  }

  console.log('❌ Stopping tests due to connection failure');
}
// Run tests if this file is executed directly
if (require.main === module) {
  runTests().catch((err) => {
    console.error(err);
    // Report the failure through the exit status as well: logging alone
    // leaves the process exiting with 0, which hides the error from shells
    // and CI. Setting exitCode (rather than calling process.exit) lets the
    // process end on its own once pending work is done.
    process.exitCode = 1;
  });
}

View file

@ -0,0 +1,15 @@
/**
 * Renders the given businesses as a responsive grid of BusinessCard
 * elements, or a short placeholder message when the list is empty.
 *
 * Each render also logs the received list to the console.
 */
const BusinessList = ({ businesses }: { businesses: BusinessData[] }) => {
  console.log('Rendering BusinessList with:', businesses);

  return businesses.length ? (
    <div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-4">
      {businesses.map((entry) => (
        <BusinessCard key={entry.id} business={entry} />
      ))}
    </div>
  ) : (
    <div>No businesses found</div>
  );
};

View file

@ -0,0 +1,148 @@
import React, { useState } from 'react';
import { Business } from '../../types/business';
/** Props accepted by the BusinessResults component. */
interface Props {
/** Rows shown in the table while the component holds no search results of its own. */
businesses: Business[];
/** Called with the chosen format when one of the export buttons is clicked. */
onExport: (format: 'csv' | 'json') => void;
/**
 * Called when a 'results' message arrives from the search stream.
 * NOTE(review): declared as taking a query string, but BusinessResults
 * invokes it with `data.results` — confirm the intended argument type.
 */
onSearch: (query: string) => void;
}
/**
 * Table of businesses with a search box and CSV/JSON export buttons.
 *
 * Pressing Enter in the search box requests `/api/search` and reads the
 * response body as a stream of newline-separated JSON messages. Each message
 * carries a `type` of 'error', 'progress' or 'results' and updates the
 * matching piece of state. The table shows the streamed results when there
 * are any, and falls back to the `businesses` prop otherwise.
 */
export const BusinessResults: React.FC<Props> = ({ businesses, onExport, onSearch }) => {
  const [error, setError] = useState<string | null>(null);
  const [loading, setLoading] = useState(false);
  const [progress, setProgress] = useState({ status: '', percent: 0 });
  const [searchResults, setSearchResults] = useState<Business[]>([]);

  // Routes one decoded stream message to the state it affects.
  const handleSearchResponse = (data: any) => {
    console.log('Received search response:', data);

    if (data.type === 'error') {
      setError(data.error);
      setLoading(false);
      return;
    }

    if (data.type === 'progress') {
      setProgress({ status: data.status, percent: data.progress });
      return;
    }

    if (data.type === 'results') {
      console.log('Setting results:', data.results);
      setSearchResults(data.results);
      // NOTE(review): Props declares onSearch as (query: string) => void, yet
      // it receives the result list here — confirm what the parent expects.
      onSearch(data.results); // Pass results up to parent
      setLoading(false);
    }
  };

  // Parses and dispatches a single line of the stream. Blank lines are
  // skipped; a malformed line is logged without aborting the remaining ones.
  const processLine = (line: string) => {
    if (!line.trim()) return;
    try {
      handleSearchResponse(JSON.parse(line));
    } catch (e) {
      console.error('Error parsing JSON:', e, 'Line:', line);
    }
  };

  // Runs one search and feeds every streamed message to handleSearchResponse.
  const handleSearch = async (query: string) => {
    setLoading(true);
    setError(null);
    setProgress({ status: 'Starting search...', percent: 0 });

    try {
      // NOTE(review): the API origin is hard-coded to localhost:3000.
      const response = await fetch(
        `http://localhost:3000/api/search?q=${encodeURIComponent(query)}`,
        {
          headers: {
            Accept: 'application/json',
            'Cache-Control': 'no-cache'
          }
        }
      );

      const reader = response.body?.getReader();
      if (!reader) throw new Error('No response body');

      const decoder = new TextDecoder();
      let buffer = '';

      while (true) {
        const { done, value } = await reader.read();
        if (done) break;

        buffer += decoder.decode(value, { stream: true });

        // Everything before the last '\n' is complete; the remainder may be
        // a partial message and is carried over to the next chunk.
        const lines = buffer.split('\n');
        buffer = lines.pop() || '';

        for (const line of lines) {
          processLine(line);
        }
      }

      // Flush any bytes still held by the decoder and handle a final message
      // that was not terminated by a newline; otherwise it would be dropped.
      buffer += decoder.decode();
      processLine(buffer);
    } catch (err) {
      console.error('Search error:', err);
      setError('Failed to fetch results');
    } finally {
      // Always leave the loading state, including when the stream ends
      // without ever sending a 'results' or 'error' message.
      setLoading(false);
    }
  };

  return (
    <div className="business-results">
      <div className="search-controls">
        <input
          type="text"
          placeholder="Search businesses..."
          onKeyDown={(e) => {
            // onKeyPress is deprecated; with keydown, skip the Enter that
            // merely confirms an in-progress IME composition.
            if (e.key === 'Enter' && !e.nativeEvent.isComposing) {
              handleSearch(e.currentTarget.value);
            }
          }}
        />
        {loading && (
          <div className="progress">
            {progress.status} ({progress.percent}%)
          </div>
        )}
        {error && (
          <div className="error">
            {error}
          </div>
        )}
      </div>
      <div className="export-controls">
        <button onClick={() => onExport('csv')}>Export CSV</button>
        <button onClick={() => onExport('json')}>Export JSON</button>
      </div>
      <table className="business-table">
        <thead>
          <tr>
            <th>Business Name</th>
            <th>Contact</th>
            <th>Address</th>
            <th>Rating</th>
            <th>Website</th>
          </tr>
        </thead>
        <tbody>
          {(searchResults.length ? searchResults : businesses).map(business => (
            <tr key={business.id}>
              <td>{business.name}</td>
              <td>
                {business.phone}<br/>
                {business.email}
              </td>
              <td>{business.address}</td>
              <td>{business.rating}/5</td>
              <td>
                {business.website && (
                  <a href={business.website} target="_blank" rel="noopener noreferrer">
                    Visit Website
                  </a>
                )}
              </td>
            </tr>
          ))}
        </tbody>
      </table>
    </div>
  );
};

4451
yarn.lock

File diff suppressed because it is too large Load diff