ragflow/internal/tokenizer/tokenizer_concurrent_test.go

//
//  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.
//

package tokenizer

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
	"testing"
	"time"

	"go.uber.org/zap"

	"ragflow/internal/logger"
)

func init() {
	// Initialize logger for tests
	if err := logger.Init("info"); err != nil {
		fmt.Printf("Failed to initialize logger: %v\n", err)
	}
}

// TestConcurrentTokenize tests concurrent tokenization with dynamic pool expansion and shrinking
func TestConcurrentTokenize(t *testing.T) {
	// Use small pool to test expansion
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        2,
		MaxSize:        10,
		IdleTimeout:    5 * time.Second,
		AcquireTimeout: 5 * time.Second,
	}

	if err := Init(cfg); err != nil {
		t.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	// Print initial pool stats
	stats := GetPoolStats()
	t.Logf("Initial pool stats: %+v", stats)

	// Test texts
	texts := []string{
		"Hello world this is a test",
		"Natural language processing is amazing",
		"Elastic pool handles concurrent requests",
		"中文分词测试",
		"深度学习与机器学习",
		"RAGFlow is an open-source RAG engine",
	}

	// Phase 1: High concurrency test - should trigger expansion
	t.Log("=== Phase 1: High concurrency test (should trigger expansion) ===")
	var expansionDetected int32
	var wg sync.WaitGroup
	numGoroutines := 20
	requestsPerGoroutine := 10

	start := time.Now()
	for i := 0; i < numGoroutines; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < requestsPerGoroutine; j++ {
				text := texts[(id+j)%len(texts)]
				result, err := Tokenize(text)
				if err != nil {
					t.Errorf("Goroutine %d request %d failed: %v", id, j, err)
					return
				}
				if result == "" {
					t.Errorf("Goroutine %d request %d returned empty result", id, j)
				}

				// Check pool stats periodically
				if j%5 == 0 {
					stats := GetPoolStats()
					currentSize := stats["current_size"].(int32)
					if currentSize > int32(cfg.MinSize) {
						atomic.StoreInt32(&expansionDetected, 1)
					}
				}
			}
		}(i)
	}
	wg.Wait()
	phase1Duration := time.Since(start)

	stats = GetPoolStats()
	t.Logf("Phase 1 completed in %v", phase1Duration)
	t.Logf("Pool stats after Phase 1: %+v", stats)

	if atomic.LoadInt32(&expansionDetected) == 1 {
		t.Log("✓ Pool expansion detected during high concurrency")
	} else {
		t.Log("℗ Pool expansion not detected (may need more concurrency)")
	}

	currentSize := stats["current_size"].(int32)
	if currentSize > int32(cfg.MinSize) {
		t.Logf("✓ Current pool size (%d) is greater than minSize (%d)", currentSize, cfg.MinSize)
	}

	// Phase 2: Wait for idle timeout - should trigger shrinking
	t.Log("=== Phase 2: Waiting for idle timeout (should trigger shrinking) ===")
	t.Logf("Waiting %v for idle instances to timeout...", cfg.IdleTimeout)
	time.Sleep(cfg.IdleTimeout + 2*time.Second)

	stats = GetPoolStats()
	t.Logf("Pool stats after Phase 2 (waiting): %+v", stats)

	currentSize = stats["current_size"].(int32)
	if currentSize <= int32(cfg.MinSize) {
		t.Logf("✓ Pool shrunk back to minSize or below: current=%d, min=%d", currentSize, cfg.MinSize)
	} else {
		t.Logf("℗ Pool not yet shrunk: current=%d, min=%d (may need more time)", currentSize, cfg.MinSize)
	}

	// Phase 3: Moderate concurrency after shrink - should trigger expansion again
	t.Log("=== Phase 3: Moderate concurrency after shrink (should trigger re-expansion) ===")
	var reExpansionDetected int32
	start = time.Now()
	for i := 0; i < numGoroutines/2; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < requestsPerGoroutine/2; j++ {
				text := texts[(id+j)%len(texts)]
				_, err := Tokenize(text)
				if err != nil {
					t.Errorf("Phase 3 goroutine %d request %d failed: %v", id, j, err)
					return
				}

				if j%3 == 0 {
					stats := GetPoolStats()
					currentSize := stats["current_size"].(int32)
					if currentSize > int32(cfg.MinSize) {
						atomic.StoreInt32(&reExpansionDetected, 1)
					}
				}
			}
		}(i)
	}
	wg.Wait()
	phase3Duration := time.Since(start)

	stats = GetPoolStats()
	t.Logf("Phase 3 completed in %v", phase3Duration)
	t.Logf("Pool stats after Phase 3: %+v", stats)

	if atomic.LoadInt32(&reExpansionDetected) == 1 {
		t.Log("✓ Pool re-expansion detected after shrink")
	}

	t.Log("=== Test completed successfully ===")
}

// TestConcurrentTokenizeWithPosition tests concurrent tokenization with position info
func TestConcurrentTokenizeWithPosition(t *testing.T) {
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        2,
		MaxSize:        8,
		IdleTimeout:    3 * time.Second,
		AcquireTimeout: 5 * time.Second,
	}

	if err := Init(cfg); err != nil {
		t.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	text := "This is a test sentence for position tracking"
	var wg sync.WaitGroup
	numGoroutines := 15

	t.Log("=== Testing TokenizeWithPosition concurrently ===")
	start := time.Now()

	for i := 0; i < numGoroutines; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < 5; j++ {
				tokens, err := TokenizeWithPosition(text)
				if err != nil {
					t.Errorf("Goroutine %d request %d failed: %v", id, j, err)
					return
				}
				if len(tokens) == 0 {
					t.Errorf("Goroutine %d request %d returned empty tokens", id, j)
					return
				}
				// Verify position info
				for _, token := range tokens {
					if token.Text == "" {
						t.Errorf("Goroutine %d request %d returned empty token text", id, j)
						return
					}
					if token.EndOffset <= token.Offset {
						t.Errorf("Goroutine %d request %d has invalid position: offset=%d, end=%d",
							id, j, token.Offset, token.EndOffset)
						return
					}
				}
			}
		}(i)
	}
	wg.Wait()

	duration := time.Since(start)
	stats := GetPoolStats()
	t.Logf("Completed %d goroutines x 5 requests in %v", numGoroutines, duration)
	t.Logf("Final pool stats: %+v", stats)
	t.Log("✓ TokenizeWithPosition concurrent test passed")
}

// TestPoolExhaustion tests pool exhaustion and timeout behavior
func TestPoolExhaustion(t *testing.T) {
	// Very small pool to test exhaustion
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        1,
		MaxSize:        2,
		IdleTimeout:    10 * time.Second,
		AcquireTimeout: 500 * time.Millisecond, // Short timeout for faster test
	}

	if err := Init(cfg); err != nil {
		t.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	t.Log("=== Testing pool exhaustion behavior ===")
	stats := GetPoolStats()
	t.Logf("Initial pool stats: %+v", stats)

	// Use all available instances
	var wg sync.WaitGroup
	barrier := make(chan struct{})
	errors := make(chan error, 10)

	// Launch goroutines that hold instances
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			<-barrier // Wait for signal to start
			_, err := Tokenize("test text")
			if err != nil {
				errors <- fmt.Errorf("goroutine %d: %w", id, err)
			}
		}(i)
	}

	// Release all goroutines at once to create contention
	close(barrier)

	// Wait for all to complete
	wg.Wait()
	close(errors)

	timeoutCount := 0
	for err := range errors {
		if err != nil {
			t.Logf("Expected error from limited pool: %v", err)
			timeoutCount++
		}
	}

	stats = GetPoolStats()
	t.Logf("Final pool stats: %+v", stats)
	t.Logf("Timeout errors: %d (expected with small pool)", timeoutCount)

	if timeoutCount > 0 {
		t.Log("✓ Pool correctly returned timeout errors when exhausted")
	} else {
		t.Log("℗ No timeout errors (pool handled all requests, may be too fast)")
	}
}

// TestFineGrainedTokenizeConcurrent tests concurrent fine-grained tokenization
func TestFineGrainedTokenizeConcurrent(t *testing.T) {
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        2,
		MaxSize:        6,
		IdleTimeout:    3 * time.Second,
		AcquireTimeout: 5 * time.Second,
	}

	if err := Init(cfg); err != nil {
		t.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	tokens := "hello world 中文测试"
	var wg sync.WaitGroup
	numGoroutines := 10

	t.Log("=== Testing FineGrainedTokenize concurrently ===")
	start := time.Now()

	for i := 0; i < numGoroutines; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < 5; j++ {
				result, err := FineGrainedTokenize(tokens)
				if err != nil {
					t.Errorf("Goroutine %d request %d failed: %v", id, j, err)
					return
				}
				if result == "" {
					t.Errorf("Goroutine %d request %d returned empty result", id, j)
				}
			}
		}(i)
	}
	wg.Wait()

	duration := time.Since(start)
	stats := GetPoolStats()
	t.Logf("Completed %d goroutines x 5 requests in %v", numGoroutines, duration)
	t.Logf("Final pool stats: %+v", stats)
	t.Log("✓ FineGrainedTokenize concurrent test passed")
}

// TestTermFreqAndTagConcurrent tests concurrent term frequency and tag lookups
func TestTermFreqAndTagConcurrent(t *testing.T) {
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        2,
		MaxSize:        6,
		IdleTimeout:    3 * time.Second,
		AcquireTimeout: 5 * time.Second,
	}

	if err := Init(cfg); err != nil {
		t.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	terms := []string{"hello", "world", "中文", "test", "natural"}
	var wg sync.WaitGroup
	numGoroutines := 10

	t.Log("=== Testing GetTermFreq and GetTermTag concurrently ===")
	start := time.Now()

	for i := 0; i < numGoroutines; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for j := 0; j < 10; j++ {
				term := terms[(id+j)%len(terms)]
				freq := GetTermFreq(term)
				tag := GetTermTag(term)
				// We don't validate the results as terms may or may not exist in dictionary
				// Just ensuring no panics or errors
				_ = freq
				_ = tag
			}
		}(i)
	}
	wg.Wait()

	duration := time.Since(start)
	stats := GetPoolStats()
	t.Logf("Completed %d goroutines x 10 requests in %v", numGoroutines, duration)
	t.Logf("Final pool stats: %+v", stats)
	t.Log("✓ GetTermFreq and GetTermTag concurrent test passed")
}

// BenchmarkTokenize benchmarks the tokenization performance
func BenchmarkTokenize(b *testing.B) {
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        runtime.NumCPU() * 2,
		MaxSize:        runtime.NumCPU() * 4,
		IdleTimeout:    5 * time.Minute,
		AcquireTimeout: 10 * time.Second,
	}

	if err := Init(cfg); err != nil {
		b.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	text := "This is a benchmark test for tokenization performance with natural language processing"

	// Warm up
	for i := 0; i < 100; i++ {
		Tokenize(text)
	}

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_, err := Tokenize(text)
			if err != nil {
				b.Errorf("Tokenize failed: %v", err)
			}
		}
	})

	stats := GetPoolStats()
	b.Logf("Final pool stats: %+v", stats)
}

// BenchmarkTokenizeWithPosition benchmarks position-aware tokenization
func BenchmarkTokenizeWithPosition(b *testing.B) {
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        runtime.NumCPU() * 2,
		MaxSize:        runtime.NumCPU() * 4,
		IdleTimeout:    5 * time.Minute,
		AcquireTimeout: 10 * time.Second,
	}

	if err := Init(cfg); err != nil {
		b.Fatalf("Failed to initialize pool: %v", err)
	}
	defer Close()

	text := "This is a benchmark test for position-aware tokenization"

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			_, err := TokenizeWithPosition(text)
			if err != nil {
				b.Errorf("TokenizeWithPosition failed: %v", err)
			}
		}
	})
}

// ExampleGetPoolStats demonstrates getting pool statistics
func ExampleGetPoolStats() {
	cfg := &PoolConfig{
		DictPath:       "/usr/share/infinity/resource",
		MinSize:        2,
		MaxSize:        10,
		IdleTimeout:    5 * time.Minute,
		AcquireTimeout: 10 * time.Second,
	}

	if err := Init(cfg); err != nil {
		fmt.Printf("Failed to initialize: %v\n", err)
		return
	}
	defer Close()

	stats := GetPoolStats()
	fmt.Printf("Pool initialized: %v\n", stats["initialized"])
	fmt.Printf("Current size: %d\n", stats["current_size"])
	fmt.Printf("Min size: %d\n", stats["min_size"])
	fmt.Printf("Max size: %d\n", stats["max_size"])

	// Output will vary based on actual initialization
}

// logPoolStats logs pool statistics using the zap logger
func logPoolStats(msg string) {
	stats := GetPoolStats()
	logger.Info(msg,
		zap.Bool("initialized", stats["initialized"].(bool)),
		zap.Int32("current_size", stats["current_size"].(int32)),
		zap.Int("min_size", stats["min_size"].(int)),
		zap.Int("max_size", stats["max_size"].(int)),
		zap.String("idle_timeout", stats["idle_timeout"].(string)),
		zap.Int("instances_available", stats["instances_available"].(int)),
	)
}