feat(server): replace existing search functionality with searxng library

This commit is contained in:
Zan 2024-07-31 11:23:39 +08:00
parent 96f67c7028
commit 28077018a6
11 changed files with 460 additions and 519 deletions

View file

@@ -41,6 +41,7 @@
"html-to-text": "^9.0.5",
"langchain": "^0.1.30",
"pdf-parse": "^1.1.1",
"searxng": "^0.0.5",
"winston": "^3.13.0",
"ws": "^8.17.1",
"zod": "^3.22.4"

View file

@@ -52,8 +52,8 @@ const basicAcademicSearchResponsePrompt = `
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by the search engine and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
@@ -113,11 +113,11 @@ const createBasicAcademicSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
lang: 'en',
engines: [
'arxiv',
'google scholar',
'internetarchivescholar',
'google_scholar',
'internet_archive_scholar',
'pubmed',
],
});

View file

@@ -53,7 +53,7 @@ const createImageSearchChain = (llm: BaseChatModel) => {
strParser,
RunnableLambda.from(async (input: string) => {
const res = await searchSearxng(input, {
engines: ['bing images', 'google images'],
engines: ['bing_images', 'google_images'],
});
const images = [];

View file

@@ -113,7 +113,7 @@ const createBasicRedditSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
lang: 'en',
engines: ['reddit'],
});

View file

@@ -13,20 +13,20 @@ import type { BaseChatModel } from '@langchain/core/language_models/chat_models'
const VideoSearchChainPrompt = `
You will be given a conversation below and a follow up question. You need to rephrase the follow-up question so it is a standalone question that can be used by the LLM to search Youtube for videos.
You need to make sure the rephrased question agrees with the conversation and is relevant to the conversation.
Example:
1. Follow up question: How does a car work?
Rephrased: How does a car work?
2. Follow up question: What is the theory of relativity?
Rephrased: What is the theory of relativity
3. Follow up question: How does an AC work?
Rephrased: How does an AC work
Conversation:
{chat_history}
Follow up question: {query}
Rephrased question:
`;
@@ -53,7 +53,7 @@ const createVideoSearchChain = (llm: BaseChatModel) => {
strParser,
RunnableLambda.from(async (input: string) => {
const res = await searchSearxng(input, {
engines: ['youtube'],
engines: ['youtube_api'],
});
const videos = [];

View file

@@ -186,7 +186,7 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
await Promise.all(docGroups.map(async (doc) => {
const res = await llm.invoke(`
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
You are a text summarizer. You need to summarize the text provided inside the \`text\` XML block.
You need to summarize the text into 1 or 2 sentences capturing the main idea of the text.
You need to make sure that you don't miss any point while summarizing the text.
You will also be given a \`query\` XML block which will contain the query of the user. Try to answer the query in the summary from the text provided.
@@ -211,14 +211,14 @@ const createBasicWebSearchRetrieverChain = (llm: BaseChatModel) => {
url: doc.metadata.url,
},
})
docs.push(document)
}))
return { query: question, docs: docs };
} else {
const res = await searchSearxng(input, {
language: 'en',
lang: 'en',
});
const documents = res.results.map(

View file

@@ -51,8 +51,8 @@ const basicWolframAlphaSearchResponsePrompt = `
Place these citations at the end of that particular sentence. You can cite the same sentence multiple times if it is relevant to the user's query like [number1][number2].
However you do not need to cite it using the same number. You can use different numbers to cite the same sentence multiple times. The number refers to the number of the search result (passed in the context) used to generate that part of the answer.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by Wolfram Alpha and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
Anything inside the following \`context\` HTML block provided below is for your knowledge returned by Wolfram Alpha and is not shared by the user. You have to answer question on the basis of it and cite the relevant information from it but you do not have to
talk about the context in your response.
<context>
{context}
@@ -112,8 +112,8 @@ const createBasicWolframAlphaSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
engines: ['wolframalpha'],
lang: 'en',
engines: ['wolframalpha_api'],
});
const documents = res.results.map(

View file

@@ -113,8 +113,8 @@ const createBasicYoutubeSearchRetrieverChain = (llm: BaseChatModel) => {
}
const res = await searchSearxng(input, {
language: 'en',
engines: ['youtube'],
lang: 'en',
engines: ['youtube_api'],
});
const documents = res.results.map(

View file

@@ -1,47 +1,19 @@
import axios from 'axios';
import { getSearxngApiEndpoint } from '../config';
interface SearxngSearchOptions {
categories?: string[];
engines?: string[];
language?: string;
pageno?: number;
}
import { SearxngService, type SearxngSearchParameters } from 'searxng';
interface SearxngSearchResult {
title: string;
url: string;
img_src?: string;
thumbnail_src?: string;
thumbnail?: string;
content?: string;
author?: string;
iframe_src?: string;
}
const searxng = new SearxngService({
baseURL: getSearxngApiEndpoint(),
defaultSearchParams: {
format: 'json'
}
})
export const searchSearxng = async (
query: string,
opts?: SearxngSearchOptions,
opts?: SearxngSearchParameters,
) => {
const searxngURL = getSearxngApiEndpoint();
const url = new URL(`${searxngURL}/search?format=json`);
url.searchParams.append('q', query);
if (opts) {
Object.keys(opts).forEach((key) => {
if (Array.isArray(opts[key])) {
url.searchParams.append(key, opts[key].join(','));
return;
}
url.searchParams.append(key, opts[key]);
});
}
const res = await axios.get(url.toString());
const results: SearxngSearchResult[] = res.data.results;
const suggestions: string[] = res.data.suggestions;
const { results, suggestions } = await searxng.search(query, opts);
return { results, suggestions };
};

View file

@@ -1,8 +1,8 @@
{
"compilerOptions": {
"lib": ["ESNext"],
"module": "Node16",
"moduleResolution": "Node16",
"module": "ESNext",
"moduleResolution": "Node",
"target": "ESNext",
"outDir": "dist",
"sourceMap": false,

884
yarn.lock

File diff suppressed because it is too large Load diff