* Ihor Radchenko <yanta...@posteo.net> [2025-04-26 20:59]:
> Jean Louis <bugs@gnu.support> writes:
> 
> >> Is the source code available anywhere?
> >
> > Of course it is available on request, but my system is too complex. I would 
> > like to make a simpler system, and first I need to find some binary vector 
> > database, or the easiest way to install a vector database, maybe with 
> > PostgreSQL.
> >
> > postgresql-17-pgvector is used, so it is available on every system.
> >
> > To make it run for all users without complexities, I would first need to 
> > handle the database initialization.
> >
> > - then there is a text chunking application in Python
> 
> Do you parse Org files for chunking? What is the chunking strategy?

Yes, I parse by headings. That may not be the best approach.

(defun rcd-process-org-headings-to-embeddings ()
  "Process all Org headings, generate embeddings, and store them
in the database."
  (interactive)
  (let* ((file (buffer-file-name))
         (file-id (rcd-db-files-insert file))
         (headings (rcd-org-get-headings-with-contents))
         (total (length headings))
         (count 0))
    (when headings
      (rcd-sql "DELETE FROM embeddings WHERE embeddings_files = $1" rcd-db file-id))
    (while headings
      (let* ((heading (pop headings))
             (heading-text (plist-get heading :heading))
             (contents (plist-get heading :contents))
             (input (concat heading-text "\n" contents))
             (embeddings (rcd-llm-get-embedding input nil "search_document: ")))
        (rcd-db-embeddings-insert nil embeddings input 6 heading-text file-id)
        (setq count (1+ count))
        (rcd-message "Processed heading %d/%d `%s'" count total heading-text)))
    (rcd-message "Finished processing Org file `%s'" file)))

(defun rcd-org-get-headings-with-contents ()
  "Return a list of all Org headings with their contents.
Each element is a plist with :heading (title), :level, and :contents (text)."
  (let ((tree (org-element-parse-buffer 'headline)))
    (org-element-map tree 'headline
      (lambda (hl)
        (let* ((beg (org-element-property :begin hl))
               (end (org-element-property :end hl))
               (contents (when (org-element-property :contents-begin hl)
                           (buffer-substring-no-properties
                            (org-element-property :contents-begin hl)
                            (org-element-property :contents-end hl)))))
          (list :level (org-element-property :level hl)
                :heading (org-element-property :raw-value hl)
                :contents (or contents "")
                :begin beg
                :end end))))))

(defun rcd-db-files-insert (file)
  "Inserts a FILE name into the 'files' table and returns its ID.

If the file already exists, does nothing."
  (rcd-sql-first "WITH ins AS (INSERT INTO files (files_name)
                               VALUES ($1)
                               ON CONFLICT (files_name)
                               DO NOTHING
                               RETURNING files_id)
                   SELECT files_id FROM ins
                   UNION ALL
                   SELECT files_id FROM files WHERE files_name = $1"
                 rcd-db file))

(defun rcd-llm-get-embedding (input &optional max-tokens chunk-prefix)
  "Fetch embeddings for INPUT, splitting it into chunks of at most MAX-TOKENS.

MAX-TOKENS defaults to 700.  When CHUNK-PREFIX is non-nil, it is
prepended to each chunk before embedding.

Return a list of (CHUNK EMBEDDING) pairs, where EMBEDDING is a
pgvector-compatible JSON array string (e.g., \"[0.1, 0.2, 0.3]\").
Chunks whose embedding could not be fetched are dropped."
  (let* ((max-tokens (or max-tokens 700))
         (chunks (rcd-llm-chunk-text input max-tokens)))
    (seq-remove (lambda (entry) (null (cadr entry)))
                (mapcar (lambda (chunk)
                          (let ((chunk (if chunk-prefix
                                           (concat chunk-prefix chunk)
                                         chunk)))
                            (list chunk (rcd-llm-get-embedding-single chunk))))
                        chunks))))

(defun rcd-llm-get-embedding-single (input)
  "Fetch a single embedding for INPUT from the embeddings API endpoint."
  (let* ((url "http://192.168.1.68:9999/v1/embeddings") ; Replace with your API endpoint
         (model "any")
         (url-request-method "POST")
         (url-request-extra-headers
          `(("Content-Type" . "application/json")
            ("Authorization" . ,(concat "Bearer " "any"))))
         (url-request-data
          (encode-coding-string
           (json-encode
            `((model . ,model)
              (input . ,input)))
           'utf-8)))
    (with-current-buffer (url-retrieve-synchronously url)
      (goto-char (point-min))
      (re-search-forward "^$") ; Skip HTTP headers
      (let* ((json-data (json-read-from-string
                         (buffer-substring-no-properties (point) (point-max))))
             (data (alist-get 'data json-data)))
        (cond (data
               (let* ((embedding (alist-get 'embedding (aref data 0)))
                      ;; Convert the vector to a pgvector-compatible JSON array string.
                      (embedding-str (json-encode embedding)))
                 embedding-str))
              (t nil))))))

(defun rcd-db-embeddings-insert (id embeddings input type &optional name file)
  "Insert EMBEDDINGS for INPUT of TYPE into the embeddings table.

EMBEDDINGS is a list of (TEXT EMBEDDING) pairs as returned by
`rcd-llm-get-embedding'.  When ID is non-nil, existing embeddings
referencing ID for that TYPE are deleted first.  NAME and FILE are
stored alongside each chunk when given."
  (when id
    (rcd-sql "DELETE FROM embeddings WHERE embeddings_embeddingtypes = $2 AND embeddings_referencedid = $1"
             rcd-db id type))
  (let* ((table (rcd-db-get-entry "embeddingtypes" "embeddingtypes_table" type rcd-db))
         (uuid (when id (rcd-db-get-entry table (format "%s_uuid" table) id rcd-db))))
    (while embeddings
      (let* ((embedding (pop embeddings))
             (text (car embedding))
             (embedding (cadr embedding))
             (sql (format "INSERT INTO embeddings (embeddings_referencedid, embeddings_referenceduuid, embeddings_embeddingtypes, embeddings_embeddings, embeddings_text, embeddings_name, embeddings_files) VALUES (%s, %s, $1, $2, $3, %s, %s) RETURNING embeddings_id"
                          (or id "DEFAULT")
                          (if uuid (sql-escape-string uuid) "DEFAULT")
                          (if name (sql-escape-string name) "DEFAULT")
                          (if file file "DEFAULT")))
             (new-id (rcd-sql-first sql rcd-db type embedding text)))
        (rcd-message "Embeddings ID %s inserted." new-id)))))

(defun rcd-llm-file-list-embeddings (&optional file-id)
  "List embeddings stored for FILE-ID, defaulting to the current buffer's file."
  (interactive)
  (let ((file-id (or file-id (rcd-db-files-exists (buffer-file-name)))))
    (when file-id
      (let ((id-list (rcd-sql-list "SELECT embeddings_id FROM embeddings WHERE embeddings_files = $1 ORDER BY embeddings_chunkid"
                                   rcd-db file-id)))
        (when id-list
          (let ((title (format "Embeddings for file ID %s" file-id))
                (sql (format "SELECT embeddings_id, embeddings_name, coalesce(embeddings_chunkid::text, 'nil') FROM embeddings WHERE embeddings_files = %s ORDER BY embeddings_id"
                             file-id)))
            (rcd-db-sql-report title sql
                               [("ID" 10 rcd-tabulated-number-as-string-predicate)
                                ("Heading" 50 t)
                                ("Chunk ID" 10 t)]
                               "embeddings" '("ID") nil nil
                               (lambda () (rcd-org-jump-to-embeddings file-id)))))))))

I am running a semantic chunker in memory for the chunking; you will find it attached.
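
The `rcd-llm-chunk-text' used above is the Elisp client side of that chunker. It is not included here, so below is only a minimal sketch of what such a client might look like, assuming the attached FastAPI service is listening on localhost:8201; the function body is my guess, not the actual code.

(defun rcd-llm-chunk-text (input &optional max-tokens overlap)
  "Split INPUT into chunks via the /chunk_text endpoint and return them.
Sketch only; assumes the attached chunking server on localhost:8201."
  (let* ((url "http://localhost:8201/chunk_text")
         (url-request-method "POST")
         (url-request-extra-headers '(("Content-Type" . "application/json")))
         (url-request-data
          (encode-coding-string
           (json-encode `((text . ,input)
                          (max_tokens . ,(or max-tokens 512))
                          (overlap . ,(or overlap 50))))
           'utf-8)))
    (with-current-buffer (url-retrieve-synchronously url)
      (goto-char (point-min))
      (re-search-forward "^$")          ; skip HTTP headers
      (let ((json-data (json-read-from-string
                        (buffer-substring-no-properties (point) (point-max)))))
        ;; The server returns chunk objects with text, tokens, start_token
        ;; and end_token; only the text is needed for embedding.
        (mapcar (lambda (chunk) (alist-get 'text chunk))
                (alist-get 'chunks json-data))))))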

> > - and there is embeddings model that runs on llama.cpp or any other 
> > inference serving.
> 
> Which model did you use? How did you integrate it with pgvector?

I am running Nomic Embed Text v1.5 in memory, and Nomic Embed Vision (they 
share the same vector space).

Those models work on CPU as well; I did not notice a meaningful difference myself.

I don't think I will change from those fast Nomic models to something
larger; there is no need. That they use the same vector space provides
a kind of magic: you can index Org files and find relevant pictures by
text, or find text by pictures, and so on.
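
To illustrate the cross-modal side, here is a minimal sketch of finding image embeddings by a text query, assuming image embeddings are stored in the same embeddings table under their own embedding type; the function name and the type value 7 are hypothetical, not existing code.

(defun rcd-llm-images-by-text (query &optional image-type limit)
  "Return image embedding rows most similar to the text QUERY.
Sketch only; IMAGE-TYPE 7 is a hypothetical embedding type for images."
  (let ((query-embedding (rcd-llm-get-embedding-single query))
        ;; Nomic's "search_query: " prefix could be prepended to QUERY here.
        (image-type (or image-type 7))
        (limit (or limit 10)))
    (rcd-sql-list
     "SELECT embeddings_id, embeddings_name
      FROM embeddings
      WHERE embeddings_embeddingtypes = $2
      ORDER BY embeddings_embeddings <=> $1
      LIMIT $3"
     rcd-db query-embedding image-type limit)))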

The SQL file in the attachment is more or less an example; there has been more 
development in the meantime.

How is it integrated with pgvector?

(defun rcd-org-semantic-link (&optional query)
  "Open the Org heading that is semantically closest to QUERY."
  (interactive)
  (let* ((query (or query (rcd-ask-get "Query semantically: ")))
         (headings (rcd-llm-file-chunks-by-embeddings query 6)))
    (when headings
      (let* ((heading (cadar headings))
             (file (rcd-db-get-entry "files" "files_name" (caddar headings) rcd-db))
             (link (format "[[file:%s::%s]]" file heading)))
        (org-link-open-from-string link)))))

(defun rcd-llm-file-chunks-by-embeddings (&optional query type similarity limit)
  "Retrieve chunks similar to QUERY from the `embeddings' table.
Only embeddings of TYPE whose distance is below SIMILARITY are
considered, up to LIMIT rows.  Each returned row is a list of
embeddings_id, embeddings_name and embeddings_files."
  (interactive)
  (let* ((query (or query (rcd-ask-get "Query: ")))
         (query-embeddings (rcd-llm-get-embedding-single query))
         (similarity (or similarity 0.5))
         (limit (or limit 10)))
    (rcd-sql-list
     "SELECT e.embeddings_id, e.embeddings_name, e.embeddings_files
      FROM (
          SELECT embeddings_id,
                 embeddings_name,
                 embeddings_files,
                 embeddings_embeddings <=> $1 AS similarity
          FROM embeddings
          WHERE embeddings_embeddings <=> $1 < $2
            AND embeddings_embeddingtypes = $3
          ORDER BY similarity ASC
          LIMIT $4
      ) AS e
      ORDER BY e.similarity ASC"
     rcd-db query-embeddings similarity type limit)))

That should give you an idea of how it works.

I still have to avoid choosing the same file/heading (self-linking).
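
One way might be to pass the current file's ID into the query and exclude it there. A minimal sketch along the lines of the function above; the extra EXCLUDE-FILE parameter and the function name are my additions, not existing code.

(defun rcd-llm-file-chunks-by-embeddings-excluding (query type exclude-file &optional similarity limit)
  "Like `rcd-llm-file-chunks-by-embeddings', but skip rows from EXCLUDE-FILE.
Sketch only; EXCLUDE-FILE is the files_id of the buffer issuing the query."
  (let ((query-embeddings (rcd-llm-get-embedding-single query))
        (similarity (or similarity 0.5))
        (limit (or limit 10)))
    (rcd-sql-list
     "SELECT embeddings_id, embeddings_name, embeddings_files
      FROM embeddings
      WHERE embeddings_embeddings <=> $1 < $2
        AND embeddings_embeddingtypes = $3
        AND embeddings_files IS DISTINCT FROM $5
      ORDER BY embeddings_embeddings <=> $1
      LIMIT $4"
     rcd-db query-embeddings similarity type limit exclude-file)))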

> > Elisp is easy.
> 
> Yet interesting. It is clear that you heavily tailored your code to your
> use case

Not at all.

> but the code itself could serve as inspiration for
> others. Maybe someone can even write a package to be useful for all the
> users.

Exactly. And EKG has something similar. 
https://www.youtube.com/watch?v=qxa2VrseFUA

> > Prerequisites are rather more complex to install.
> >
> > The power that it creates is great. Links can be expanded in different ways 
> > for different outputs (I guess).
> 
> No doubt. You may even have special link types that will use the vector
> search + extra LLM filtering technique with additional prompt.
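
For example, a custom link type along those lines could be defined with `org-link-set-parameters'; a minimal sketch assuming the functions above, where the "semantic" link name is just an example:

;; A "semantic:" link whose path is a query rather than a location;
;; following it jumps to the semantically closest heading.
(org-link-set-parameters
 "semantic"
 :follow (lambda (query &optional _arg)
           (rcd-org-semantic-link query)))

;; Usage in an Org buffer: [[semantic:vector database in PostgreSQL]]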

-- 
Jean Louis

Attachment: rcd-semantic-split-server.sh
Description: Bourne shell script

from fastapi import Body, FastAPI, UploadFile, File, HTTPException
from fastapi.responses import JSONResponse
import tiktoken
import re
from typing import List, Dict

app = FastAPI()

# Constants
MAX_INPUT_LENGTH = 1000000  # ~1MB of text
BATCH_SIZE = 100000  # Increased batch size for better performance

# Pre-compile regex patterns for better performance
REPEAT_CHARS = re.compile(r'(.)\1{2,}')  # For chars like ---, ===
BOX_CHARS = re.compile(r'[─━│┃┄┅┆┇┈┉┊┋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬]+')

def clean_text(text: str) -> str:
    """Clean text without any HTML parsing"""
    # Reduce repetitive characters (3+ repeats down to 3)
    text = REPEAT_CHARS.sub(r'\1\1\1', text)
    
    # Replace box-drawing characters with simple dashes
    text = BOX_CHARS.sub('---', text)
    
    # Normalize whitespace
    return ' '.join(text.split())

def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[Dict]:
    """Efficient chunking with token awareness"""
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    chunks = []
    
    for i in range(0, len(tokens), max_tokens - overlap):
        chunk_tokens = tokens[i:i + max_tokens]
        chunks.append({
            "text": enc.decode(chunk_tokens),
            "tokens": len(chunk_tokens),
            "start_token": i,
            "end_token": i + len(chunk_tokens)
        })
    
    return chunks

@app.post("/chunk")
async def chunk_file(
    file: UploadFile = File(...),
    max_tokens: int = 512,
    overlap: int = 50
):
    if not file.content_type.startswith('text/'):
        raise HTTPException(400, "Only text files accepted")
    
    try:
        text = (await file.read()).decode('utf-8')
        if len(text) > MAX_INPUT_LENGTH:
            raise HTTPException(413, f"Input too large. Max {MAX_INPUT_LENGTH} chars allowed")
        
        cleaned_text = clean_text(text)
        chunks = chunk_text(cleaned_text, max_tokens, overlap)
        return JSONResponse({
            "filename": file.filename,
            "total_chunks": len(chunks),
            "chunks": chunks
        })
    except Exception as e:
        raise HTTPException(500, f"Processing error: {str(e)}")

@app.post("/chunk_text")
async def chunk_raw_text(
    text: str = Body(..., embed=True),
    max_tokens: int = Body(512),
    overlap: int = Body(50)
):
    try:
        if len(text) > MAX_INPUT_LENGTH:
            raise HTTPException(413, f"Input too large. Max {MAX_INPUT_LENGTH} chars allowed")
        
        cleaned_text = clean_text(text)
        chunks = chunk_text(cleaned_text, max_tokens, overlap)
        return JSONResponse({
            "total_chunks": len(chunks),
            "chunks": chunks
        })
    except Exception as e:
        raise HTTPException(500, f"Error: {str(e)}")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8201)
    
# from fastapi import Body, FastAPI, UploadFile, File, HTTPException
# from fastapi.responses import JSONResponse
# import tiktoken
# import re
# from bs4 import BeautifulSoup
# from typing import List, Dict

# app = FastAPI()

# # Constants
# MAX_INPUT_LENGTH = 1000000
# BATCH_SIZE = 10000

# def preprocess_content(text: str) -> str:
#     """Handle HTML and repetitive characters"""
#     # HTML/XML content extraction
#     if re.search(r'<[a-z][\s>]', text[:1000], re.I):
#         try:
#             soup = BeautifulSoup(text, 'html.parser')
#             # Preserve preformatted text
#             for pre in soup.find_all('pre'):
#                 pre.replace_with('\n' + pre.get_text() + '\n')
#             text = soup.get_text()
#         except:
#             text = re.sub(r'<[^>]+>', '', text)  # Fallback HTML tag removal

#     # Reduce repetitive characters (3+ repeats down to 3)
#     text = re.sub(r'(.)\1{2,}', r'\1\1\1', text)  # For chars like ---, ===
    
#     # Special handling for Emacs Lisp box-drawing chars
#     text = re.sub(r'[─━│┃┄┅┆┇┈┉┊┋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬]+', '---', text)
    
#     return re.sub(r'\s+', ' ', text).strip()

# def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[Dict]:
#     """Smart chunking preserving code structure"""
#     enc = tiktoken.get_encoding("cl100k_base")
#     tokens = enc.encode(text)
#     chunks = []
    
#     # Find natural breakpoints (comments, defuns, etc.)
#     separators = [';;;', ';;;;', '(defun', '(defvar', '(require', '\n\n']
#     separator_indices = [
#         i for i, token in enumerate(tokens)
#         if any(enc.decode(tokens[i:i+len(s)]) == s for s in separators)
#     ]
    
#     for i in range(0, len(tokens), max_tokens - overlap):
#         end = min(i + max_tokens, len(tokens))
        
#         # Adjust to nearest semantic break
#         if separator_indices:
#             end = max(
#                 i + int(max_tokens * 0.7),  # Minimum chunk size
#                 min([x for x in separator_indices if i <= x <= end] or [end])
#             )
        
#         chunks.append({
#             "text": enc.decode(tokens[i:end]),
#             "tokens": end - i,
#             "start_token": i,
#             "end_token": end
#         })
    
#     return chunks

# def process_large_text(text: str, max_tokens: int, overlap: int) -> List[Dict]:
#     """Process text with format awareness"""
#     processed_text = preprocess_content(text)
    
#     if len(processed_text) <= BATCH_SIZE:
#         return chunk_text(processed_text, max_tokens, overlap)
    
#     # Split at major section breaks
#     batches = re.split(r'(;;;+|\n{3,}|\(defun)', processed_text)
#     all_chunks = []
#     token_offset = 0
    
#     for batch in batches:
#         if not batch.strip():
#             continue
            
#         batch_chunks = chunk_text(batch, max_tokens, overlap)
#         for chunk in batch_chunks:
#             chunk['start_token'] += token_offset
#             chunk['end_token'] += token_offset
#         all_chunks.extend(batch_chunks)
#         token_offset = all_chunks[-1]['end_token'] if all_chunks else 0
    
#     return all_chunks

# @app.post("/chunk")
# async def chunk_file(
#     file: UploadFile = File(...),
#     max_tokens: int = 512,
#     overlap: int = 50
# ):
#     if not file.content_type.startswith(('text/', 'application/xml', 'application/html')):
#         raise HTTPException(400, "Only text/HTML/XML files accepted")
    
#     try:
#         text = (await file.read()).decode('utf-8')
#         if len(text) > MAX_INPUT_LENGTH:
#             raise HTTPException(413, f"Input too large. Max {MAX_INPUT_LENGTH} chars allowed")
        
#         chunks = process_large_text(text, max_tokens, overlap)
#         return JSONResponse({
#             "filename": file.filename,
#             "total_chunks": len(chunks),
#             "chunks": chunks,
#             "preprocessing_applied": True
#         })
#     except Exception as e:
#         raise HTTPException(500, f"Processing error: {str(e)}")

# @app.post("/chunk_text")
# async def chunk_raw_text(
#     text: str = Body(..., embed=True),
#     max_tokens: int = Body(512),
#     overlap: int = Body(50)
# ):
#     try:
#         if len(text) > MAX_INPUT_LENGTH:
#             raise HTTPException(413, f"Input too large. Max {MAX_INPUT_LENGTH} chars allowed")
        
#         chunks = process_large_text(text, max_tokens, overlap)
#         return JSONResponse({
#             "total_chunks": len(chunks),
#             "chunks": chunks,
#             "preprocessing_applied": True
#         })
#     except Exception as e:
#         raise HTTPException(500, f"Error: {str(e)}")

# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8201)

Attachment: rcd-llm-start-embeddings-model.sh
Description: Bourne shell script

Attachment: rcd-org-semantic-link.sql
Description: application/sql
