feat(server): replace existing search functionality with searxng library

This commit is contained in:
Zan 2024-07-31 11:23:39 +08:00
parent 96f67c7028
commit 28077018a6
11 changed files with 460 additions and 519 deletions

View file

@@ -41,6 +41,7 @@
"html-to-text": "^9.0.5",
"langchain": "^0.1.30",
"pdf-parse": "^1.1.1",
"searxng": "^0.0.5",
"winston": "^3.13.0",
"ws": "^8.17.1",
"zod": "^3.22.4"

View file

@@ -52,8 +52,8 @@ const basicAcademicSearchResponsePrompt = `
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
@@ -113,11 +113,11 @@ const createBasicAcademicSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
lang: 'en',
engines: [
'arxiv',
'google scholar',
'internetarchivescholar',
'google_scholar',
'internet_archive_scholar',
'pubmed',
],
});

View file

@@ -53,7 +53,7 @@ const createImageSearchChain = (llm: BaseChatModel) => {
strParser,
RunnableLambda.from(async (input: string) => {
const res = await searchSearxng(input, {
engines: ['bing images', 'google images'],
engines: ['bing_images', 'google_images'],
});
const images = [];

View file

@@ -113,7 +113,7 @@ const createBasicRedditSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
lang: 'en',
engines: ['reddit'],
});

View file

@@ -13,20 +13,20 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
const VideoSearchChainPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question so it is a standalone question that can be used by the LLM to search Youtube for videos.
You need to make sure the rephrased question agrees with the conversation and is relevant to the conversation.
Example:
1. Follow up question: How does a car work?
Rephrased: How does a car work?
2. Follow up question: What is the theory of relativity?
Rephrased: What is the theory of relativity
3. Follow up question: How does an AC work?
Rephrased: How does an AC work
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
@@ -53,7 +53,7 @@ const createVideoSearchChain = (llm: BaseChatModel) => {
strParser,
RunnableLambda.from(async (input: string) => {
const res = await searchSearxng(input, {
engines: ['youtube'],
engines: ['youtube_api'],
});
const videos = [];

View file

@@ -186,7 +186,7 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
await Promise.all(docGroups.map(async (doc) => {
const res = await llm.invoke(`
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
You need to summarize the text into 1 or 2 sentences capturing the main idea of the text.
You need to make sure that you don't miss any point while summarizing the text.
You will also be given a \`query\` XML block which will contain the query of the user. Try to answer the query in the summary from the text provided.
@@ -211,14 +211,14 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
url: doc.metadata.url,
},
})
docs.push(document)
}))
return { query: question, docs: docs };
} else {
const res = await searchSearxng(input, {
language: 'en',
lang: 'en',
});
const documents = res.results.map(

View file

@@ -51,8 +51,8 @@ const basicWolframAlphaSearchResponsePrompt = `
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by Wolfram Alpha and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by Wolfram Alpha and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
@@ -112,8 +112,8 @@ const createBasicWolframAlphaSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
engines: ['wolframalpha'],
lang: 'en',
engines: ['wolframalpha_api'],
});
const documents = res.results.map(

View file

@@ -113,8 +113,8 @@ const createBasicYoutubeSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
engines: ['youtube'],
lang: 'en',
engines: ['youtube_api'],
});
const documents = res.results.map(

View file

@@ -1,47 +1,19 @@
import axios from 'axios';
import { getSearxngApiEndpoint } from '../config';
interface SearxngSearchOptions {
categories?: string[];
engines?: string[];
language?: string;
pageno?: number;
}
import { SearxngService, type SearxngSearchParameters } from 'searxng';
interface SearxngSearchResult {
title: string;
url: string;
img_src?: string;
thumbnail_src?: string;
thumbnail?: string;
content?: string;
author?: string;
iframe_src?: string;
}
const searxng = new SearxngService({
baseURL: getSearxngApiEndpoint(),
defaultSearchParams: {
format: 'json'
}
})
export const searchSearxng = async (
query: string,
opts?: SearxngSearchOptions,
opts?: SearxngSearchParameters,
) => {
const searxngURL = getSearxngApiEndpoint();
const url = new URL(`${searxngURL}/search?format=json`);
url.searchParams.append('q', query);
if (opts) {
Object.keys(opts).forEach((key) => {
if (Array.isArray(opts[key])) {
url.searchParams.append(key, opts[key].join(','));
return;
}
url.searchParams.append(key, opts[key]);
});
}
const res = await axios.get(url.toString());
const results: SearxngSearchResult[] = res.data.results;
const suggestions: string[] = res.data.suggestions;
const { results, suggestions } = await searxng.search(query, opts);
return { results, suggestions };
};

View file

@@ -1,8 +1,8 @@
{
"compilerOptions": {
"lib": ["ESNext"],
"module": "Node16",
"moduleResolution": "Node16",
"module": "ESNext",
"moduleResolution": "Node",
"target": "ESNext",
"outDir": "dist",
"sourceMap": false,

884
yarn.lock

File diff suppressed because it is too large Load diff