【.NET Orleans入門】第4回:クラスタリングと高可用性 - 本番環境でのOrleans運用

はじめに

これまでの記事で、Orleansの基本概念、ステート管理、ストリーミングについて学んできました。今回は、本番環境でOrleansを運用するために必要な「クラスタリングと高可用性」について解説します。

数百台のサーバーで構成される大規模クラスター、99.99%の可用性を実現する設計、無停止でのアップデート――これらの実現方法を、実践的な構成例とともに学んでいきます。

Orleansクラスターの基礎

クラスターメンバーシップ

// Cluster configuration for an Azure environment
public class Program
{
    /// <summary>
    /// Builds a generic host that hosts an Orleans silo and runs it until the host shuts down.
    /// </summary>
    /// <param name="args">Command-line arguments passed to the default host builder.</param>
    public static async Task Main(string[] args)
    {
        var hostBuilder = Host.CreateDefaultBuilder(args);

        hostBuilder.UseOrleans((hostContext, silo) =>
        {
            var settings = hostContext.Configuration;

            // Cluster membership stored in Azure Table Storage
            silo.UseAzureStorageClustering(clustering =>
            {
                clustering.ConnectionString = settings["Orleans:AzureStorage:ConnectionString"];
                clustering.TableName = "OrleansClusterMembership";
            });

            // Cluster identity; fixed names are used when the settings are absent
            silo.Configure<ClusterOptions>(cluster =>
            {
                cluster.ClusterId = settings["Orleans:ClusterId"] ?? "production-cluster";
                cluster.ServiceId = settings["Orleans:ServiceId"] ?? "MyOrleansService";
            });

            // Silo-to-silo and client gateway endpoints
            silo.ConfigureEndpoints(
                siloPort: 11111,
                gatewayPort: 30000,
                listenOnAnyHostAddress: true);

            // Dashboard
            silo.UseDashboard(dashboard =>
            {
                dashboard.Port = 8080;
                dashboard.HostSelf = true;
                dashboard.CounterUpdateIntervalMs = 1000;
            });
        });

        hostBuilder.ConfigureServices(services =>
        {
            // Time the host allows for shutdown
            services.Configure<HostOptions>(
                hostOptions => hostOptions.ShutdownTimeout = TimeSpan.FromSeconds(30));
        });

        var host = hostBuilder.Build();

        await host.RunAsync();
    }
}

様々なメンバーシッププロバイダー

// The calls below are alternatives: pick the one membership provider that fits the environment.

// 1. Development: localhost
siloBuilder.UseLocalhostClustering();

// 2. Azure Table Storage
siloBuilder.UseAzureStorageClustering(azure =>
{
    azure.ConnectionString = azureConnectionString;
    azure.TableName = "OrleansCluster";
});

// 3. ADO.NET (SQL Server, PostgreSQL, MySQL)
siloBuilder.UseAdoNetClustering(adoNet =>
{
    adoNet.Invariant = "System.Data.SqlClient";
    adoNet.ConnectionString = sqlConnectionString;
});

// 4. Apache ZooKeeper
siloBuilder.UseZooKeeperClustering(
    zooKeeper => zooKeeper.ConnectionString = "localhost:2181");

// 5. Consul
siloBuilder.UseConsulClustering(consul =>
{
    consul.Address = new Uri("http://localhost:8500");
    consul.KvRootFolder = "orleans";
});

// 6. Kubernetes
siloBuilder.UseKubernetesClustering(options =>
{
    options.Group = "orleans.io";
    // Must name a version that the silos.orleans.io CRD actually serves;
    // the CRD manifest in this article declares only v1beta1.
    options.ApiVersion = "v1beta1";
});

Kubernetesでの本番環境構築

Kubernetes用のSilo構成

// Program.cs - for the Kubernetes environment
public class Program
{
    /// <summary>
    /// Builds and runs a host whose Orleans silo uses Kubernetes clustering, Redis grain storage,
    /// OpenTelemetry tracing exported to Jaeger, and registered health checks.
    /// </summary>
    /// <param name="args">Command-line arguments passed to the default host builder.</param>
    /// <remarks>
    /// NOTE(review): this method only registers the health checks; nothing here maps them to an
    /// HTTP endpoint. Confirm that /health/live and /health/ready are exposed elsewhere.
    /// </remarks>
    public static async Task Main(string[] args)
    {
        var host = Host.CreateDefaultBuilder(args)
            .UseOrleans((context, siloBuilder) =>
            {
                var configuration = context.Configuration;

                siloBuilder
                    // Kubernetes clustering
                    .UseKubernetesClustering(options =>
                    {
                        // CustomResourceDefinition settings
                        options.Group = "orleans.io";
                        options.ApiVersion = "v1beta1";
                        options.PluralName = "silos";
                        // Prefer the application setting, then the ORLEANS_CLUSTER_ID
                        // environment variable that the pod spec defines.
                        options.ClusterId = configuration["Orleans:ClusterId"]
                            ?? configuration["ORLEANS_CLUSTER_ID"];
                        // NAMESPACE keeps its original precedence; POD_NAMESPACE is the variable
                        // the pod spec fills from metadata.namespace, so the silo no longer
                        // silently falls back to "default" when only that one is set.
                        options.Namespace = configuration["NAMESPACE"]
                            ?? configuration["POD_NAMESPACE"]
                            ?? "default";
                        options.CanCreateResources = true;
                    })
                    // Persistence in Redis
                    .AddRedisGrainStorage("redis-store", options =>
                    {
                        options.ConnectionString = configuration["Redis:ConnectionString"];
                        options.UseJson = true;
                        options.DatabaseNumber = 0;
                    })
                    // Distributed tracing
                    .AddActivityPropagation()
                    .ConfigureServices(services =>
                    {
                        services.AddOpenTelemetry()
                            .WithTracing(builder =>
                            {
                                builder
                                    .AddAspNetCoreInstrumentation()
                                    .AddHttpClientInstrumentation()
                                    .AddSource("Orleans.Runtime")
                                    .AddSource("Orleans.Application")
                                    .AddJaegerExporter(options =>
                                    {
                                        // Override the exporter's own defaults only when a value
                                        // is configured; a missing setting must not become
                                        // a null host or port 0.
                                        var agentHost = configuration["Jaeger:AgentHost"];
                                        if (!string.IsNullOrWhiteSpace(agentHost))
                                        {
                                            options.AgentHost = agentHost;
                                        }

                                        options.AgentPort = configuration.GetValue(
                                            "Jaeger:AgentPort",
                                            options.AgentPort);
                                    });
                            });
                    });

                // Health checks
                siloBuilder.ConfigureServices(services =>
                {
                    services.AddHealthChecks()
                        .AddCheck<SiloHealthCheck>("silo_health")
                        .AddCheck<GrainHealthCheck>("grain_health");
                });
            })
            .Build();

        await host.RunAsync();
    }
}

Kubernetes マニフェスト

# orleans-crd.yaml - Custom Resource Definition
# Declares the namespaced custom resource "Silo" in the API group orleans.io.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  # Composed of the plural name and the group declared below
  name: silos.orleans.io
spec:
  group: orleans.io
  versions:
  # Single version; it is both served and used as the storage version
  - name: v1beta1
    served: true
    storage: true
    schema:
      openAPIV3Schema:
        type: object
        properties:
          # spec and status are declared as plain objects with no nested properties.
          # NOTE(review): with apiextensions.k8s.io/v1, fields missing from the schema are
          # normally pruned unless x-kubernetes-preserve-unknown-fields is set - confirm the
          # data written by the silo survives.
          spec:
            type: object
          status:
            type: object
  scope: Namespaced
  names:
    plural: silos
    singular: silo
    kind: Silo

---
# orleans-deployment.yaml
# StatefulSet that runs five silo pods in the orleans-system namespace.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: orleans-silo
  namespace: orleans-system
spec:
  # NOTE(review): serviceName normally refers to a headless Service; no Service named
  # orleans-silo is defined alongside this manifest - confirm it exists.
  serviceName: orleans-silo
  replicas: 5
  selector:
    matchLabels:
      app: orleans-silo
  template:
    metadata:
      labels:
        app: orleans-silo
        orleans-cluster-member: "true"
    spec:
      # Account the pods run as
      # NOTE(review): no terminationGracePeriodSeconds is set, so the Kubernetes default applies;
      # confirm it is long enough for the silo's graceful shutdown.
      serviceAccountName: orleans-silo
      containers:
      - name: silo
        # NOTE(review): a mutable "latest" tag makes rollouts hard to reproduce; consider
        # pinning a version tag.
        image: myregistry.azurecr.io/orleans-silo:latest
        ports:
        # Silo-to-silo traffic
        - containerPort: 11111
          name: silo
          protocol: TCP
        # Client gateway
        - containerPort: 30000
          name: gateway
          protocol: TCP
        # Dashboard (the probes below also target this port)
        - containerPort: 8080
          name: dashboard
          protocol: TCP
        env:
        # NOTE(review): confirm the silo process actually reads these two variable names.
        - name: ORLEANS_SERVICE_ID
          value: "MyOrleansService"
        - name: ORLEANS_CLUSTER_ID
          value: "production-cluster"
        # Pod metadata exposed to the container through field references
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        resources:
          requests:
            memory: "2Gi"
            cpu: "1000m"
          limits:
            memory: "4Gi"
            cpu: "2000m"
        # NOTE(review): both probes use port 8080, which is declared above as the dashboard
        # port; confirm the process serves /health/live and /health/ready there.
        livenessProbe:
          httpGet:
            path: /health/live
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /health/ready
            port: 8080
          initialDelaySeconds: 10
          periodSeconds: 5
        volumeMounts:
        # Read-only configuration mounted from the ConfigMap volume below
        - name: config
          mountPath: /app/config
          readOnly: true
      volumes:
      # The orleans-config ConfigMap is not defined alongside this manifest
      - name: config
        configMap:
          name: orleans-config

---
# orleans-service.yaml
# Exposes the gateway port (30000) of the pods labelled app=orleans-silo.
apiVersion: v1
kind: Service
metadata:
  name: orleans-gateway
  namespace: orleans-system
spec:
  # NOTE(review): LoadBalancer makes the gateway reachable from outside the cluster;
  # confirm that external exposure is intended.
  type: LoadBalancer
  selector:
    app: orleans-silo
  ports:
  - name: gateway
    port: 30000
    targetPort: 30000
    protocol: TCP

---
# orleans-dashboard-service.yaml
# Cluster-internal Service for the dashboard port (8080) of the pods labelled app=orleans-silo.
apiVersion: v1
kind: Service
metadata:
  name: orleans-dashboard
  namespace: orleans-system
spec:
  type: ClusterIP
  selector:
    app: orleans-silo
  ports:
  - name: dashboard
    port: 8080
    targetPort: 8080

---
# orleans-rbac.yaml
# Service account referenced by serviceAccountName in the silo pod template.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: orleans-silo
  namespace: orleans-system

---
# Grants full access (read, write, delete) to the "silos" custom resources in group orleans.io.
# NOTE(review): this is a ClusterRole, so the grant is cluster-wide; a namespaced Role may be
# sufficient - confirm against the deployment's needs.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: orleans-silo
rules:
- apiGroups: ["orleans.io"]
  resources: ["silos"]
  verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

---
# Binds the orleans-silo ClusterRole to the orleans-silo service account in orleans-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: orleans-silo
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: orleans-silo
subjects:
- kind: ServiceAccount
  name: orleans-silo
  namespace: orleans-system

高可用性の実現

1. Grainの配置戦略

// Custom placement strategy

/// <summary>
/// Marks a grain class so that it is placed using <see cref="PreferLocalPlacementStrategy"/>.
/// </summary>
[AttributeUsage(AttributeTargets.Class)]
public class PreferLocalPlacementAttribute : PlacementAttribute
{
    /// <summary>Creates the attribute with a new strategy instance.</summary>
    public PreferLocalPlacementAttribute()
        : base(new PreferLocalPlacementStrategy()) { }
}

/// <summary>
/// Marker type for the strategy; it carries no settings of its own.
/// </summary>
public class PreferLocalPlacementStrategy : PlacementStrategy { }

/// <summary>
/// Placement director that chooses the local silo when it is compatible with the grain and
/// otherwise the compatible silo reporting the fewest active grains.
/// </summary>
/// <remarks>
/// NOTE(review): no registration of this director appears in this listing - confirm that it is
/// registered for <see cref="PreferLocalPlacementStrategy"/>.
/// </remarks>
public class PreferLocalPlacementDirector : IPlacementDirector
{
    /// <summary>
    /// Selects the silo on which a new activation should be created.
    /// </summary>
    /// <param name="strategy">The placement strategy in effect (not inspected).</param>
    /// <param name="target">The placement target used to look up compatible silos.</param>
    /// <param name="context">Provides the compatible silos, the local silo and silo statistics.</param>
    /// <returns>
    /// A task holding the chosen silo. As before, any failure is reported through the returned
    /// task rather than thrown synchronously.
    /// </returns>
    public Task<SiloAddress> OnAddActivation(
        PlacementStrategy strategy,
        PlacementTarget target,
        IPlacementContext context)
    {
        try
        {
            return Task.FromResult(SelectSilo(target, context));
        }
        catch (Exception ex)
        {
            // The original method was async, so callers observe failures via the task.
            return Task.FromException<SiloAddress>(ex);
        }
    }

    // Picks the local silo if compatible, otherwise the least-loaded compatible silo.
    private static SiloAddress SelectSilo(PlacementTarget target, IPlacementContext context)
    {
        var silos = context.GetCompatibleSilos(target).ToList();

        // Previously an empty list reached an indexer on the empty list and surfaced as an
        // ArgumentOutOfRangeException; fail with a message that names the real problem.
        if (silos.Count == 0)
        {
            throw new InvalidOperationException(
                "No compatible silos are available for placement");
        }

        // Prefer the local silo
        var localSilo = silos.FirstOrDefault(s => s.Equals(context.LocalSilo));
        if (localSilo != null)
        {
            return localSilo;
        }

        // Otherwise balance load: silos without statistics count as zero active grains.
        // MinBy returns the first minimum, matching the earlier OrderBy(...).FirstOrDefault().
        return silos.MinBy(s => context.GetSiloStatistics(s)?.ActiveGrainCount ?? 0);
    }
}

// Usage on a grain
/// <summary>
/// Example grain that opts in to the prefer-local placement via its attribute.
/// </summary>
[PreferLocalPlacement]
public class CacheGrain : Grain, ICacheGrain
{
    // Activations become more likely to be placed on the local silo
}

2. 自動フェイルオーバー

// Health check implementation
/// <summary>
/// Health check that reports the local silo's status and the number of active silos in the cluster.
/// </summary>
/// <remarks>
/// NOTE(review): <c>GetSiloStatusAsync</c> and <c>GetClusterMembershipAsync</c> are used exactly as
/// in the original listing - confirm both exist in the Orleans version in use.
/// </remarks>
public class SiloHealthCheck : IHealthCheck
{
    // Below this many active silos the cluster is reported as degraded.
    private const int MinimumActiveSilos = 3;

    private readonly ISiloHost _siloHost;
    private readonly IClusterMembershipService _membershipService;

    /// <summary>Creates the health check.</summary>
    /// <param name="siloHost">Host used to read the local silo status.</param>
    /// <param name="membershipService">Service used to read cluster membership.</param>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public SiloHealthCheck(
        ISiloHost siloHost,
        IClusterMembershipService membershipService)
    {
        _siloHost = siloHost ?? throw new ArgumentNullException(nameof(siloHost));
        _membershipService = membershipService ?? throw new ArgumentNullException(nameof(membershipService));
    }

    /// <summary>
    /// Evaluates silo and cluster health.
    /// </summary>
    /// <param name="context">Health check context (not inspected).</param>
    /// <param name="cancellationToken">Token that cancels the check.</param>
    /// <returns>
    /// Unhealthy when the silo is not active or the check itself fails; Degraded when fewer than
    /// three silos are active; Healthy otherwise.
    /// </returns>
    /// <exception cref="OperationCanceledException">The check was cancelled.</exception>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            cancellationToken.ThrowIfCancellationRequested();

            var status = await _siloHost.GetSiloStatusAsync();

            if (status != SiloStatus.Active)
            {
                return HealthCheckResult.Unhealthy(
                    $"Silo status is {status}");
            }

            cancellationToken.ThrowIfCancellationRequested();

            var membership = await _membershipService.GetClusterMembershipAsync();
            var activeSilos = membership.Members.Count(m => m.Status == SiloStatus.Active);

            if (activeSilos < MinimumActiveSilos)
            {
                return HealthCheckResult.Degraded(
                    $"Only {activeSilos} active silos in cluster");
            }

            return HealthCheckResult.Healthy(
                $"Silo is healthy. {activeSilos} active silos in cluster");
        }
        // A cancelled check is not a sick silo: let cancellation propagate to the caller.
        catch (Exception ex) when (ex is not OperationCanceledException)
        {
            return HealthCheckResult.Unhealthy(
                "Health check failed",
                ex);
        }
    }
}

// Grain-level failover handling
/// <summary>
/// Grain that records which silo it was last activated on and runs recovery logic when it is
/// reactivated on a different silo.
/// </summary>
/// <remarks>
/// <c>RecoverTransactionAsync</c> and <c>LogFailoverEventAsync</c> are not part of this listing.
/// NOTE(review): any activation on a different silo takes the failover path, not only
/// activations that follow a crash - confirm this is intended.
/// </remarks>
public class ResilientGrain : Grain, IResilientGrain
{
    private readonly IPersistentState<MyState> _state;

    /// <summary>
    /// Receives the grain's persistent state through constructor injection; previously the
    /// field was never assigned and every access would have thrown.
    /// </summary>
    /// <param name="state">
    /// State stored in the "redis-store" provider, the storage name registered in the silo
    /// configuration of this article. NOTE(review): the state name "state" is an assumption.
    /// </param>
    public ResilientGrain(
        [PersistentState("state", "redis-store")] IPersistentState<MyState> state)
    {
        _state = state ?? throw new ArgumentNullException(nameof(state));
    }

    /// <summary>
    /// Runs failover recovery when the stored silo identity differs from the current one, then
    /// stores the current identity.
    /// </summary>
    /// <remarks>
    /// Declared public because an override cannot narrow the accessibility of the base method.
    /// NOTE(review): assumes the parameterless <c>OnActivateAsync()</c> of the Orleans version in
    /// use; newer versions take a CancellationToken - confirm.
    /// </remarks>
    public override async Task OnActivateAsync()
    {
        await base.OnActivateAsync();

        // Handling for the case where the grain moved from another silo
        var previousSilo = _state.State.LastActiveSilo;
        if (previousSilo != null && previousSilo != RuntimeIdentity)
        {
            await HandleFailoverAsync();
        }

        _state.State.LastActiveSilo = RuntimeIdentity;
        await _state.WriteStateAsync();
    }

    // Recovers transactions that were still processing, then records the failover.
    private async Task HandleFailoverAsync()
    {
        // Freshly created state may not have a transaction list yet
        var pending = _state.State.PendingTransactions;
        if (pending != null)
        {
            // Snapshot first so recovery may modify the underlying collection
            var unfinished = pending
                .Where(t => t.Status == TransactionStatus.Processing)
                .ToList();

            foreach (var transaction in unfinished)
            {
                // Inspect the transaction state and handle it appropriately
                await RecoverTransactionAsync(transaction);
            }
        }

        // Record the failover event
        await LogFailoverEventAsync();
    }
}

3. ローリングアップデート

// Graceful shutdown implementation
/// <summary>
/// Hosted service that, on shutdown, stops new activations, waits for active grains to drain and
/// then stops the silo.
/// </summary>
/// <remarks>
/// NOTE(review): <c>StopAcceptingNewActivationsAsync</c> and <c>GetRuntimeStatisticsAsync</c> are
/// used exactly as in the original listing - confirm both exist in the Orleans version in use.
/// NOTE(review): the drain may take up to five minutes; confirm the host's ShutdownTimeout and
/// the pod's termination grace period allow for that.
/// </remarks>
public class GracefulShutdownHostedService : IHostedService
{
    // Upper bound on how long to wait for grains to deactivate
    private static readonly TimeSpan DrainTimeout = TimeSpan.FromMinutes(5);

    // Pause between two looks at the active grain count
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(5);

    private readonly ISiloHost _siloHost;
    private readonly IGrainFactory _grainFactory;
    private readonly ILogger<GracefulShutdownHostedService> _logger;

    /// <summary>
    /// Receives the collaborators through dependency injection; previously the fields were
    /// never assigned.
    /// </summary>
    /// <param name="siloHost">The silo host to drain and stop.</param>
    /// <param name="grainFactory">Grain factory (kept for parity with the original fields).</param>
    /// <param name="logger">Logger for shutdown progress.</param>
    public GracefulShutdownHostedService(
        ISiloHost siloHost,
        IGrainFactory grainFactory,
        ILogger<GracefulShutdownHostedService> logger)
    {
        _siloHost = siloHost ?? throw new ArgumentNullException(nameof(siloHost));
        _grainFactory = grainFactory ?? throw new ArgumentNullException(nameof(grainFactory));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>Nothing to do at start-up; the method is required by the interface.</summary>
    /// <param name="cancellationToken">Not used.</param>
    public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;

    /// <summary>
    /// Drains the silo and then stops it.
    /// </summary>
    /// <param name="cancellationToken">
    /// Signals that shutdown must no longer be graceful; the drain wait is abandoned, but the
    /// silo is still asked to stop.
    /// </param>
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Starting graceful shutdown...");

        try
        {
            // Stop new activations
            await _siloHost.StopAcceptingNewActivationsAsync();

            // Wait for active grains to move away
            var stopwatch = Stopwatch.StartNew();
            var drained = false;

            while (stopwatch.Elapsed < DrainTimeout)
            {
                var statistics = await _siloHost.GetRuntimeStatisticsAsync();
                if (statistics.ActiveGrainCount == 0)
                {
                    _logger.LogInformation("All grains have been deactivated");
                    drained = true;
                    break;
                }

                _logger.LogInformation(
                    "Waiting for {ActiveGrainCount} grains to deactivate...",
                    statistics.ActiveGrainCount);

                await Task.Delay(PollInterval, cancellationToken);
            }

            if (!drained)
            {
                _logger.LogWarning(
                    "Grains were still active after {DrainTimeout}; stopping the silo anyway",
                    DrainTimeout);
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Previously the cancellation escaped here and the silo was never stopped.
            _logger.LogWarning("Graceful drain was cancelled; stopping the silo immediately");
        }

        // Shut down completely
        await _siloHost.StopAsync(cancellationToken);
    }
}

// Kubernetes用のローリングアップデート設定
/*
kubectl set image statefulset/orleans-silo \
    silo=myregistry.azurecr.io/orleans-silo:v2.0 \
    --namespace=orleans-system

# または、Helmを使用
helm upgrade orleans-cluster ./orleans-chart \
    --set image.tag=v2.0 \
    --namespace orleans-system \
    --wait
*/

監視とオブザーバビリティ

1. カスタムメトリクスの実装

/// <summary>
/// Hosted service that publishes cluster-level metrics on the "Orleans.Cluster" meter and
/// collects grain statistics every 30 seconds.
/// </summary>
/// <remarks>
/// <c>GetActiveSilosCount</c>, <c>GetTotalGrainCount</c> and <c>RecordStreamMetrics</c> are not
/// part of this listing.
/// NOTE(review): <c>GetSimpleGrainStatisticsAsync</c> is called exactly as in the original listing
/// - confirm it exists on the injected service in the Orleans version in use.
/// </remarks>
public class OrleansMetricsCollector : IHostedService, IDisposable
{
    private readonly IMeterFactory _meterFactory;
    private readonly IClusterMembershipService _membershipService;
    private readonly ISiloStatusOracle _siloStatusOracle;
    private readonly ILogger<OrleansMetricsCollector> _logger;
    private Timer _timer;

    // Metrics
    private readonly ObservableGauge<int> _activeSilosGauge;
    private readonly ObservableGauge<long> _totalGrainsGauge;
    private readonly Counter<long> _grainActivations;
    private readonly Counter<long> _grainDeactivations;
    private readonly Histogram<double> _grainCallDuration;

    /// <summary>Creates the collector and registers its instruments.</summary>
    /// <param name="meterFactory">Factory used to create the "Orleans.Cluster" meter.</param>
    /// <param name="membershipService">Cluster membership service.</param>
    /// <param name="siloStatusOracle">Source of the grain statistics.</param>
    /// <param name="logger">Optional logger; collection failures are logged when it is supplied.</param>
    public OrleansMetricsCollector(
        IMeterFactory meterFactory,
        IClusterMembershipService membershipService,
        ISiloStatusOracle siloStatusOracle,
        ILogger<OrleansMetricsCollector> logger = null)
    {
        _meterFactory = meterFactory;
        _membershipService = membershipService;
        _siloStatusOracle = siloStatusOracle;
        _logger = logger;

        var meter = meterFactory.Create("Orleans.Cluster");

        _activeSilosGauge = meter.CreateObservableGauge<int>(
            "orleans.cluster.active_silos",
            () => GetActiveSilosCount(),
            description: "Number of active silos in the cluster");

        _totalGrainsGauge = meter.CreateObservableGauge<long>(
            "orleans.cluster.total_grains",
            () => GetTotalGrainCount(),
            description: "Total number of active grains");

        _grainActivations = meter.CreateCounter<long>(
            "orleans.grain.activations",
            description: "Number of grain activations");

        _grainDeactivations = meter.CreateCounter<long>(
            "orleans.grain.deactivations",
            description: "Number of grain deactivations");

        _grainCallDuration = meter.CreateHistogram<double>(
            "orleans.grain.call_duration",
            unit: "ms",
            description: "Duration of grain method calls");
    }

    /// <summary>Starts the collection timer: first run immediately, then every 30 seconds.</summary>
    /// <param name="cancellationToken">Not used.</param>
    public Task StartAsync(CancellationToken cancellationToken)
    {
        _timer = new Timer(
            CollectMetrics,
            null,
            TimeSpan.Zero,
            TimeSpan.FromSeconds(30));

        return Task.CompletedTask;
    }

    /// <summary>Stops further timer callbacks; required by the hosted-service interface.</summary>
    /// <param name="cancellationToken">Not used.</param>
    public Task StopAsync(CancellationToken cancellationToken)
    {
        _timer?.Change(Timeout.Infinite, Timeout.Infinite);
        return Task.CompletedTask;
    }

    /// <summary>Releases the timer.</summary>
    public void Dispose()
    {
        _timer?.Dispose();
        _timer = null;
    }

    // Timer callback. The async work lives in a Task-returning method that handles its own
    // failures, so nothing can escape as an unobserved exception from the timer thread.
    private void CollectMetrics(object state)
    {
        _ = CollectMetricsAsync();
    }

    // Collects grain statistics and records the stream-related ones.
    private async Task CollectMetricsAsync()
    {
        try
        {
            // Collect cluster statistics
            var statistics = await _siloStatusOracle.GetSimpleGrainStatisticsAsync();

            // Update custom metrics
            foreach (var stat in statistics)
            {
                if (stat.GrainType.Contains("Orleans.Streams", StringComparison.Ordinal))
                {
                    // Stream-related metrics
                    RecordStreamMetrics(stat);
                }
            }
        }
        catch (Exception ex)
        {
            // Collection is best effort, but a failure must be visible rather than swallowed
            _logger?.LogError(ex, "Failed to collect Orleans cluster metrics");
        }
    }
}

2. 分散トレーシング

// Tracing of grain calls
/// <summary>
/// Incoming grain call filter that wraps each call in an activity from the
/// "Orleans.Application" activity source.
/// </summary>
public class TracingGrainCallFilter : IIncomingGrainCallFilter
{
    private readonly ActivitySource _activitySource;

    /// <summary>Creates the filter together with its activity source.</summary>
    public TracingGrainCallFilter() =>
        _activitySource = new ActivitySource("Orleans.Application");

    /// <summary>
    /// Invokes the grain call; when an activity was started, tags it and records a failure
    /// before rethrowing.
    /// </summary>
    /// <param name="context">The incoming call being processed.</param>
    public async Task Invoke(IIncomingGrainCallContext context)
    {
        var grainType = context.Grain.GetType().Name;
        var methodName = context.InterfaceMethod.Name;

        using var span = _activitySource.StartActivity(
            $"{grainType}.{methodName}",
            ActivityKind.Server);

        // No activity was started: run the call without any tracing work
        if (span == null)
        {
            await context.Invoke();
            return;
        }

        span.SetTag("grain.type", grainType);
        span.SetTag("grain.method", methodName);
        span.SetTag("grain.id", context.Grain.GetPrimaryKeyString());

        try
        {
            await context.Invoke();
        }
        catch (Exception failure)
        {
            span.SetStatus(ActivityStatusCode.Error, failure.Message);
            span.RecordException(failure);
            throw;
        }
    }
}

3. アラートルール

# prometheus-alerts.yaml
# Alerting rules for the Orleans cluster; the group is evaluated every 30 seconds.
# NOTE(review): the metric names below assume a direct dot-to-underscore mapping of the
# application's instrument names. Some exporters add suffixes such as _total or a unit name -
# confirm against the names actually scraped.
groups:
  - name: orleans_alerts
    interval: 30s
    rules:
      # Fires when fewer than 3 silos have been active for 2 minutes
      - alert: OrleansSiloDown
        expr: orleans_cluster_active_silos < 3
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Orleans cluster has less than 3 active silos"
          description: "Only {{ $value }} silos are active"
      
      # Fires when the per-second activation rate (5-minute window) stays above 1000 for 5 minutes
      - alert: OrleansHighGrainActivationRate
        expr: rate(orleans_grain_activations[5m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High grain activation rate"
          description: "Activation rate is {{ $value }} per second"
      
      # Fires when the 99th-percentile call duration stays above 1000 for 5 minutes
      # NOTE(review): histogram_quantile is usually applied to buckets aggregated with
      # "sum by (le)"; confirm whether per-series quantiles are intended here.
      - alert: OrleansGrainCallLatency
        expr: |
          histogram_quantile(0.99, 
            rate(orleans_grain_call_duration_bucket[5m])
          ) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High grain call latency"
          description: "P99 latency is {{ $value }}ms"

ディザスタリカバリー

マルチリージョン構成

// Multi-cluster configuration
/// <summary>
/// Helper that applies the multi-cluster settings to a silo builder.
/// </summary>
/// <remarks>
/// NOTE(review): confirm that the multi-cluster builder methods used here exist in the Orleans
/// version in use.
/// </remarks>
public class MultiClusterConfiguration
{
    /// <summary>
    /// Configures Azure Table clustering, the multi-cluster network, its gossip channel and the
    /// multi-cluster service.
    /// </summary>
    /// <param name="siloBuilder">The builder to configure.</param>
    /// <param name="configuration">Source of the connection string and the cluster id.</param>
    /// <returns>The builder returned by the last configuration call.</returns>
    public static ISiloBuilder ConfigureMultiCluster(
        ISiloBuilder siloBuilder,
        IConfiguration configuration) =>
        siloBuilder
            // Membership table
            .UseAzureStorageClustering(membership =>
            {
                membership.ConnectionString = configuration["Orleans:AzureStorage:ConnectionString"];
                membership.TableName = "OrleansClusterMembership";
            })
            // Participating clusters and gossip cadence
            .UseMultiCluster(multiCluster =>
            {
                multiCluster.ClusterId = configuration["Orleans:ClusterId"];
                multiCluster.HasMultiClusterNetwork = true;
                multiCluster.DefaultMultiCluster = new[] { "us-east", "eu-west", "asia-pacific" };
                multiCluster.MaxMultiClusterGateways = 10;
                multiCluster.BackgroundGossipInterval = TimeSpan.FromSeconds(30);
            })
            // Gossip channel kept in the same storage account
            .UseAzureTableGossipChannel(gossip =>
            {
                gossip.ConnectionString = configuration["Orleans:AzureStorage:ConnectionString"];
                gossip.TableName = "OrleansGossipChannel";
            })
            // Timeouts and retries between clusters
            .ConfigureMultiClusterService(service =>
            {
                service.ResponseTimeout = TimeSpan.FromSeconds(30);
                service.MaxSendAttempts = 3;
                service.MaxSendRetryDelay = TimeSpan.FromSeconds(5);
            });
}

// Grain communication across regions
/// <summary>
/// Grain that reads a user profile from the local region and falls back to the other regions
/// when the local call fails.
/// </summary>
public class GlobalUserGrain : Grain, IGlobalUserGrain
{
    // Regions tried, in order, after the local lookup has failed
    private static readonly string[] FallbackClusters = { "us-east", "eu-west", "asia-pacific" };

    /// <summary>
    /// Gets the profile of the given user.
    /// </summary>
    /// <param name="userId">Key of the user grain.</param>
    /// <returns>The profile from the first region that answers.</returns>
    /// <exception cref="InvalidOperationException">
    /// No region returned a profile. The inner <see cref="AggregateException"/> carries the
    /// failure of every attempt, which was previously discarded.
    /// </exception>
    public async Task<UserProfile> GetUserProfileAsync(string userId)
    {
        // Check the local region first
        var localGrain = GrainFactory.GetGrain<IUserGrain>(userId);

        try
        {
            return await localGrain.GetProfileAsync();
        }
        catch (OrleansException localFailure)
        {
            var failures = new System.Collections.Generic.List<Exception> { localFailure };

            // Fall back to the other regions
            foreach (var cluster in FallbackClusters)
            {
                try
                {
                    // NOTE(review): in the documented GetGrain overloads the second string
                    // argument is a grain class name prefix, not a cluster id - confirm that
                    // this call really targets another cluster.
                    var remoteGrain = GrainFactory.GetGrain<IUserGrain>(
                        userId,
                        cluster);

                    return await remoteGrain.GetProfileAsync();
                }
                catch (Exception remoteFailure)
                {
                    // Remember why this region failed, then try the next one
                    failures.Add(remoteFailure);
                }
            }

            throw new InvalidOperationException(
                "User not found in any region",
                new AggregateException(failures));
        }
    }
}

パフォーマンスチューニング

Siloの最適化

// High-performance settings
// NOTE(review): several of these option types and properties differ between Orleans versions -
// confirm each one against the version in use.
siloBuilder
    // Connection limits
    .Configure<SiloOptions>(silo =>
    {
        silo.DefaultConnectionLimit = 1000;
        silo.ServicePointOptions = ServicePointOptions.Default with
        {
            ConnectionLimit = 1000,
            DefaultConnectionLimit = 1000,
            Expect100Continue = false,
            UseNagleAlgorithm = false
        };
    })
    // Load shedding with a limit of 95
    .Configure<LoadSheddingOptions>(shedding =>
    {
        shedding.LoadSheddingEnabled = true;
        shedding.LoadSheddingLimit = 95;
    })
    // Scheduler threads and warning thresholds
    .Configure<SchedulingOptions>(scheduling =>
    {
        scheduling.MaxActiveThreads = Environment.ProcessorCount * 4;
        scheduling.DelayWarningThreshold = TimeSpan.FromSeconds(10);
        scheduling.ActivationSchedulingQuantum = TimeSpan.FromMilliseconds(100);
        scheduling.TurnWarningLengthThreshold = TimeSpan.FromSeconds(5);
    })
    // Message timeouts, resends and buffer pool sizing
    .Configure<MessagingOptions>(messaging =>
    {
        messaging.ResponseTimeout = TimeSpan.FromSeconds(30);
        messaging.MaxResendCount = 3;
        messaging.ResendOnTimeout = true;
        messaging.DropExpiredMessages = true;
        messaging.BufferPoolBufferSize = 4 * 1024;
        messaging.BufferPoolMaxSize = 10000;
        messaging.BufferPoolPreallocationSize = 1000;
    })
    // Socket connection timeout and maximum socket age
    .Configure<NetworkingOptions>(networking =>
    {
        networking.OpenConnectionTimeout = TimeSpan.FromSeconds(10);
        networking.MaxSocketAge = TimeSpan.FromHours(1);
    });

まとめ

今回は、.NET Orleansのクラスタリングと高可用性について学びました。重要なポイント:

  1. 柔軟なクラスタリング: 様々な環境(Kubernetes、Azure、オンプレミス)に対応
  2. 自動フェイルオーバー: Grainの自動移行と状態の保持
  3. 無停止アップデート: ローリングアップデートによるサービス継続性
  4. 包括的な監視: メトリクス、トレーシング、アラートの統合
  5. ディザスタリカバリー: マルチリージョン構成での冗長性

次回は、実践的なパフォーマンスチューニングとトラブルシューティングについて解説します。


次回予告:「第5回:パフォーマンスチューニングとトラブルシューティング」では、Orleansアプリケーションの性能を最大化し、よくある問題を解決する実践的なテクニックを解説します。

技術的な課題をお持ちですか?専門チームがサポートします。

記事でご紹介した技術や実装について、
より詳細なご相談やプロジェクトのサポートを承ります。