readysite / website / internal / assist / filecache.go
3.5 KB
filecache.go
package assist

import (
	"crypto/sha256"
	"encoding/hex"
	"fmt"
	"strings"

	"github.com/readysite/readysite/website/models"
)

// Limits applied when caching file text content.
const (
	// MaxCachedFileSize is the largest file size, in bytes (100KB), that is
	// eligible for caching.
	MaxCachedFileSize = 100 << 10

	// MaxCachedTextLength is the length, as measured by len, beyond which
	// extracted text is truncated before it is cached.
	MaxCachedTextLength = 10 * 1000

	// TokenEstimateRatio is the approximate number of characters per token;
	// text length is divided by it to estimate a token count.
	TokenEstimateRatio = 4
)

// GetFileContent returns the text content of a file, using cache if available.
//
// The file is loaded with models.Files.Get. An error is returned when that
// lookup fails, when file.IsText() reports false, or when file.Size exceeds
// MaxCachedFileSize.
//
// Otherwise a SHA-256 hash of file.Data is compared with the cache row stored
// for fileID. On a match the cached text is returned unchanged. On a miss the
// text is re-extracted, truncated to at most MaxCachedTextLength bytes (with a
// "\n...[truncated]" marker appended), written back to the cache together with
// an estimated token count, and returned.
func GetFileContent(fileID string) (string, error) {
	file, err := models.Files.Get(fileID)
	if err != nil {
		return "", fmt.Errorf("file not found: %w", err)
	}

	// Only text-based files are eligible.
	if !file.IsText() {
		return "", fmt.Errorf("file is not text-based (type: %s)", file.MimeType)
	}

	// Refuse files above the caching size limit.
	if file.Size > MaxCachedFileSize {
		return "", fmt.Errorf("file too large (%d bytes, max %d)", file.Size, MaxCachedFileSize)
	}

	// The hash of the current data decides whether a cached row is still valid.
	hash := computeHash(file.Data)

	// The lookup's second return value is deliberately discarded: whenever no
	// cache row comes back, the request is simply handled as a cache miss.
	cache, _ := models.FileContentCaches.First("WHERE FileID = ?", fileID)
	if cache != nil && cache.ContentHash == hash {
		return cache.TextContent, nil
	}

	textContent := extractTextContent(file)

	if len(textContent) > MaxCachedTextLength {
		cut := MaxCachedTextLength
		// Slicing at a fixed byte offset could split a multi-byte UTF-8
		// sequence and leave invalid UTF-8 at the end of the text. If the byte
		// at the cut is a continuation byte (bit pattern 10xxxxxx), move the
		// cut back to the start of that sequence. A sequence has at most three
		// continuation bytes, so the back-off is bounded to three steps; this
		// also keeps malformed input from walking the cut arbitrarily far.
		for back := 0; back < 3 && cut > 0 && textContent[cut]&0xC0 == 0x80; back++ {
			cut--
		}
		textContent = textContent[:cut] + "\n...[truncated]"
	}

	// Rough token estimate derived from the (possibly truncated) text length.
	tokenCount := len(textContent) / TokenEstimateRatio

	// Refresh the existing row or create a new one. The outcome of the cache
	// write is not inspected; the freshly extracted text is returned either way.
	if cache != nil {
		cache.ContentHash = hash
		cache.TextContent = textContent
		cache.TokenCount = tokenCount
		models.FileContentCaches.Update(cache)
	} else {
		cache = &models.FileContentCache{
			FileID:      fileID,
			ContentHash: hash,
			TextContent: textContent,
			TokenCount:  tokenCount,
		}
		models.FileContentCaches.Insert(cache)
	}

	return textContent, nil
}

// InvalidateFileCache drops the cached content row for fileID, if one exists.
// It is meant to be called whenever the file is modified or deleted.
func InvalidateFileCache(fileID string) {
	entry, _ := models.FileContentCaches.First("WHERE FileID = ?", fileID)
	if entry == nil {
		// Nothing cached for this file.
		return
	}
	models.FileContentCaches.Delete(entry)
}

// computeHash returns the SHA-256 digest of data as a lowercase hex string.
func computeHash(data []byte) string {
	hasher := sha256.New()
	hasher.Write(data) // hash.Hash documents that Write never returns an error
	return hex.EncodeToString(hasher.Sum(nil))
}

// extractTextContent converts a file's raw data into the text that gets cached.
//
// Nil data yields the empty string. Content whose MimeType is "text/html" or
// "application/json" is returned exactly as stored (HTML keeps its markup and
// JSON is not reformatted). Every other type has its line endings normalized
// so that "\r\n" and lone "\r" both become "\n".
func extractTextContent(file *models.File) string {
	if file.Data == nil {
		return ""
	}

	raw := string(file.Data)

	// HTML and JSON pass through untouched.
	if file.MimeType == "text/html" || file.MimeType == "application/json" {
		return raw
	}

	// "\r\n" is listed first so a CRLF pair collapses into a single "\n"
	// instead of being rewritten as two separate line breaks.
	return strings.NewReplacer("\r\n", "\n", "\r", "\n").Replace(raw)
}

// GetCachedTokenCount reports the estimated token count stored in the cache
// row for fileID. It returns 0 when no cache row is found.
func GetCachedTokenCount(fileID string) int {
	entry, _ := models.FileContentCaches.First("WHERE FileID = ?", fileID)
	if entry == nil {
		return 0
	}
	return entry.TokenCount
}

// IsFileCacheable reports whether file qualifies for caching as AI context:
// it must be non-nil, text-based according to file.IsText(), and no larger
// than MaxCachedFileSize.
func IsFileCacheable(file *models.File) bool {
	switch {
	case file == nil:
		return false
	case !file.IsText():
		return false
	default:
		return file.Size <= MaxCachedFileSize
	}
}
← Back