seaweedfs/weed/admin/maintenance/pending_operations_test.go
Chris Lu 891a2fb6eb
Admin: misc improvements on admin server and workers. EC now works. (#7055)
* initial design

* added simulation as tests

* reorganized the codebase to move the simulation framework and tests into their own dedicated package

* integration test. ec worker task

* remove "enhanced" reference

* start master, volume servers, filer

Current Status
✅ Master: Healthy and running (port 9333)
✅ Filer: Healthy and running (port 8888)
✅ Volume Servers: All 6 servers running (ports 8080-8085)
🔄 Admin/Workers: Will start when dependencies are ready

* generate write load

* tasks are assigned

* admin starts with grpc port. worker has its own working directory

* Update .gitignore

* working worker and admin. Task detection is not working yet.

* compiles, detection uses volumeSizeLimitMB from master

* compiles

* worker retries connecting to admin

* build and restart

* rendering pending tasks

* skip task ID column

* sticky worker id

* test canScheduleTaskNow

* worker reconnect to admin

* clean up logs

* worker register itself first

* worker can run ec work and report status

but:
1. one volume should not be repeatedly worked on (see the sketch below).
2. ec shards need to be distributed and the source data should be deleted.
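
A minimal sketch of the first constraint, reusing the PendingOperations API exercised in the test file below. Every identifier here appears in that test (the import path follows the file header); running it as a standalone program is an illustration, not code from this commit:

package main

import (
	"fmt"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/admin/maintenance"
)

func main() {
	ops := maintenance.NewPendingOperations()
	ops.AddOperation(&maintenance.PendingOperation{
		VolumeID:      42,
		OperationType: maintenance.OpTypeErasureCoding,
		SourceNode:    "node1",
		TaskID:        "task-ec-42",
		StartTime:     time.Now(),
		Status:        "assigned",
	})

	// A detector should skip volumes that already have pending work,
	// so the same volume is never handed to two workers.
	if ops.HasPendingOperationOnVolume(42) {
		fmt.Println("volume 42 already has a pending operation; skipping")
	}
}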

* move ec task logic

* listing ec shards

* local copy, ec. Need to distribute.

* ec is mostly working now

* distribution of ec shards needs improvement
* need configuration to enable ec

* show ec volumes

* interval field UI component

* rename

* integration test with vacuuming

* garbage percentage threshold

* fix warning

* display ec shard sizes

* fix ec volumes list

* Update ui.go

* show default values

* ensure correct default value

* MaintenanceConfig use ConfigField

* use schema defined defaults

* config

* reduce duplication

* refactor to use BaseUIProvider

* each task register its schema

* checkECEncodingCandidate use ecDetector

* use vacuumDetector

* use volumeSizeLimitMB

* remove

* remove unused

* refactor

* use new framework

* remove v2 reference

* refactor

* left menu can scroll now

* fix: the maintenance manager was not initialized when no data directory was configured for persistent storage

* saving config

* Update task_config_schema_templ.go

* enable/disable tasks

* protobuf encoded task configurations

* fix system settings

* use ui component

* remove logs

* interface{} reduction

* reduce interface{}

* reduce interface{}

* avoid from/to map

* reduce interface{}

* refactor

* keep it DRY

* added logging

* debug messages

* debug level

* debug

* show the log caller line

* use configured task policy

* log level

* handle admin heartbeat response

* Update worker.go

* fix EC rack and dc count

* Report task status to admin server

* fix task logging, simplify interface checking, use erasure_coding constants

* factor in empty volume server during task planning

* volume.list adds disk id

* track disk id also

* fix locking for scheduled and manual scanning

* add active topology

* simplify task detector

* ec task completed, but shards are not showing up

* implement ec in ec_typed.go

* adjust log level

* dedup

* implementing ec copying shards and only ecx files

* use disk id when distributing ec shards

🎯 Planning: ActiveTopology creates DestinationPlan with specific TargetDisk
📦 Task Creation: maintenance_integration.go creates ECDestination with DiskId
🚀 Task Execution: EC task passes DiskId in VolumeEcShardsCopyRequest
💾 Volume Server: Receives disk_id and stores shards on specific disk (vs.store.Locations[req.DiskId])
📂 File System: EC shards and metadata land in the exact disk directory planned
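
A self-contained sketch of that hand-off. The type and field names mirror the summary above, but the structs are simplified stand-ins, not the real seaweedfs types:

package main

import "fmt"

// DestinationPlan is what planning produces: a specific target disk,
// not just a target node.
type DestinationPlan struct {
	TargetNode string
	TargetDisk uint32
}

// ECDestination carries the planned disk id into the task.
type ECDestination struct {
	Node   string
	DiskId uint32
}

// toDestination preserves the planned disk id verbatim, so the volume
// server can store shards under store.Locations[DiskId] as planned.
func toDestination(p DestinationPlan) ECDestination {
	return ECDestination{Node: p.TargetNode, DiskId: p.TargetDisk}
}

func main() {
	plan := DestinationPlan{TargetNode: "node2:8080", TargetDisk: 1}
	dest := toDestination(plan)
	fmt.Printf("copy EC shards to %s, disk %d\n", dest.Node, dest.DiskId)
}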

* Delete original volume from all locations

* clean up existing shard locations

* local encoding and distributing

* Update docker/admin_integration/EC-TESTING-README.md

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* check volume id range

* simplify

* fix tests

* fix types

* clean up logs and tests

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-30 12:38:03 -07:00

package maintenance

import (
	"testing"
	"time"

	"github.com/seaweedfs/seaweedfs/weed/worker/types"
)

func TestPendingOperations_ConflictDetection(t *testing.T) {
	pendingOps := NewPendingOperations()

	// Add a pending erasure coding operation on volume 123
	op := &PendingOperation{
		VolumeID:      123,
		OperationType: OpTypeErasureCoding,
		SourceNode:    "node1",
		TaskID:        "task-001",
		StartTime:     time.Now(),
		EstimatedSize: 1024 * 1024 * 1024, // 1GB
		Collection:    "test",
		Status:        "assigned",
	}
	pendingOps.AddOperation(op)

	// Test conflict detection
	if !pendingOps.HasPendingOperationOnVolume(123) {
		t.Errorf("Expected volume 123 to have pending operation")
	}

	if !pendingOps.WouldConflictWithPending(123, OpTypeVacuum) {
		t.Errorf("Expected conflict when trying to add vacuum operation on volume 123")
	}

	if pendingOps.HasPendingOperationOnVolume(124) {
		t.Errorf("Expected volume 124 to have no pending operation")
	}

	if pendingOps.WouldConflictWithPending(124, OpTypeVacuum) {
		t.Errorf("Expected no conflict for volume 124")
	}
}

func TestPendingOperations_CapacityProjection(t *testing.T) {
	pendingOps := NewPendingOperations()

	// Add operation moving volume from node1 to node2
	op1 := &PendingOperation{
		VolumeID:      100,
		OperationType: OpTypeVolumeMove,
		SourceNode:    "node1",
		DestNode:      "node2",
		TaskID:        "task-001",
		StartTime:     time.Now(),
		EstimatedSize: 2 * 1024 * 1024 * 1024, // 2GB
		Collection:    "test",
		Status:        "in_progress",
	}

	// Add operation moving volume from node3 to node1
	op2 := &PendingOperation{
		VolumeID:      101,
		OperationType: OpTypeVolumeMove,
		SourceNode:    "node3",
		DestNode:      "node1",
		TaskID:        "task-002",
		StartTime:     time.Now(),
		EstimatedSize: 1 * 1024 * 1024 * 1024, // 1GB
		Collection:    "test",
		Status:        "assigned",
	}

	pendingOps.AddOperation(op1)
	pendingOps.AddOperation(op2)

	// Test capacity impact for node1
	incoming, outgoing := pendingOps.GetPendingCapacityImpactForNode("node1")

	expectedIncoming := uint64(1 * 1024 * 1024 * 1024) // 1GB incoming
	expectedOutgoing := uint64(2 * 1024 * 1024 * 1024) // 2GB outgoing

	if incoming != expectedIncoming {
		t.Errorf("Expected incoming capacity %d, got %d", expectedIncoming, incoming)
	}

	if outgoing != expectedOutgoing {
		t.Errorf("Expected outgoing capacity %d, got %d", expectedOutgoing, outgoing)
	}

	// Test projection for node1
	currentUsed := uint64(10 * 1024 * 1024 * 1024)   // 10GB current
	totalCapacity := uint64(50 * 1024 * 1024 * 1024) // 50GB total

	projection := pendingOps.GetNodeCapacityProjection("node1", currentUsed, totalCapacity)

	expectedProjectedUsed := currentUsed + incoming - outgoing     // 10 + 1 - 2 = 9GB
	expectedProjectedFree := totalCapacity - expectedProjectedUsed // 50 - 9 = 41GB

	if projection.ProjectedUsed != expectedProjectedUsed {
		t.Errorf("Expected projected used %d, got %d", expectedProjectedUsed, projection.ProjectedUsed)
	}

	if projection.ProjectedFree != expectedProjectedFree {
		t.Errorf("Expected projected free %d, got %d", expectedProjectedFree, projection.ProjectedFree)
	}
}

func TestPendingOperations_VolumeFiltering(t *testing.T) {
	pendingOps := NewPendingOperations()

	// Create volume metrics
	metrics := []*types.VolumeHealthMetrics{
		{VolumeID: 100, Server: "node1"},
		{VolumeID: 101, Server: "node2"},
		{VolumeID: 102, Server: "node3"},
		{VolumeID: 103, Server: "node1"},
	}

	// Add pending operations on volumes 101 and 103
	op1 := &PendingOperation{
		VolumeID:      101,
		OperationType: OpTypeVacuum,
		SourceNode:    "node2",
		TaskID:        "task-001",
		StartTime:     time.Now(),
		EstimatedSize: 1024 * 1024 * 1024,
		Status:        "in_progress",
	}

	op2 := &PendingOperation{
		VolumeID:      103,
		OperationType: OpTypeErasureCoding,
		SourceNode:    "node1",
		TaskID:        "task-002",
		StartTime:     time.Now(),
		EstimatedSize: 2 * 1024 * 1024 * 1024,
		Status:        "assigned",
	}

	pendingOps.AddOperation(op1)
	pendingOps.AddOperation(op2)

	// Filter metrics
	filtered := pendingOps.FilterVolumeMetricsExcludingPending(metrics)

	// Should only have volumes 100 and 102 (101 and 103 are filtered out)
	if len(filtered) != 2 {
		t.Errorf("Expected 2 filtered metrics, got %d", len(filtered))
	}

	// Check that correct volumes remain
	foundVolumes := make(map[uint32]bool)
	for _, metric := range filtered {
		foundVolumes[metric.VolumeID] = true
	}

	if !foundVolumes[100] || !foundVolumes[102] {
		t.Errorf("Expected volumes 100 and 102 to remain after filtering")
	}

	if foundVolumes[101] || foundVolumes[103] {
		t.Errorf("Expected volumes 101 and 103 to be filtered out")
	}
}

func TestPendingOperations_OperationLifecycle(t *testing.T) {
	pendingOps := NewPendingOperations()

	// Add operation
	op := &PendingOperation{
		VolumeID:      200,
		OperationType: OpTypeVolumeBalance,
		SourceNode:    "node1",
		DestNode:      "node2",
		TaskID:        "task-balance-001",
		StartTime:     time.Now(),
		EstimatedSize: 1024 * 1024 * 1024,
		Status:        "assigned",
	}
	pendingOps.AddOperation(op)

	// Check it exists
	if !pendingOps.HasPendingOperationOnVolume(200) {
		t.Errorf("Expected volume 200 to have pending operation")
	}

	// Update status
	pendingOps.UpdateOperationStatus("task-balance-001", "in_progress")

	retrievedOp := pendingOps.GetPendingOperationOnVolume(200)
	if retrievedOp == nil {
		t.Errorf("Expected to retrieve pending operation for volume 200")
	} else if retrievedOp.Status != "in_progress" {
		t.Errorf("Expected operation status to be 'in_progress', got '%s'", retrievedOp.Status)
	}

	// Complete operation
	pendingOps.RemoveOperation("task-balance-001")

	if pendingOps.HasPendingOperationOnVolume(200) {
		t.Errorf("Expected volume 200 to have no pending operation after removal")
	}
}

func TestPendingOperations_StaleCleanup(t *testing.T) {
	pendingOps := NewPendingOperations()

	// Add recent operation
	recentOp := &PendingOperation{
		VolumeID:      300,
		OperationType: OpTypeVacuum,
		SourceNode:    "node1",
		TaskID:        "task-recent",
		StartTime:     time.Now(),
		EstimatedSize: 1024 * 1024 * 1024,
		Status:        "in_progress",
	}

	// Add stale operation (24 hours ago)
	staleOp := &PendingOperation{
		VolumeID:      301,
		OperationType: OpTypeErasureCoding,
		SourceNode:    "node2",
		TaskID:        "task-stale",
		StartTime:     time.Now().Add(-24 * time.Hour),
		EstimatedSize: 2 * 1024 * 1024 * 1024,
		Status:        "in_progress",
	}

	pendingOps.AddOperation(recentOp)
	pendingOps.AddOperation(staleOp)

	// Clean up operations older than 1 hour
	removedCount := pendingOps.CleanupStaleOperations(1 * time.Hour)

	if removedCount != 1 {
		t.Errorf("Expected to remove 1 stale operation, removed %d", removedCount)
	}

	// Recent operation should still exist
	if !pendingOps.HasPendingOperationOnVolume(300) {
		t.Errorf("Expected recent operation on volume 300 to still exist")
	}

	// Stale operation should be removed
	if pendingOps.HasPendingOperationOnVolume(301) {
		t.Errorf("Expected stale operation on volume 301 to be removed")
	}
}