【.NET Orleans入門】第4回:クラスタリングと高可用性 - 本番環境でのOrleans運用
タグ: .NET, Orleans, クラスタリング, 高可用性, Kubernetes, 監視
はじめに
これまでの記事で、Orleansの基本概念、ステート管理、ストリーミングについて学んできました。今回は、本番環境でOrleansを運用するために必要な「クラスタリングと高可用性」について解説します。
数百台のサーバーで構成される大規模クラスター、99.99%の可用性を実現する設計、無停止でのアップデート...これらの実現方法を、実践的な構成例とともに学んでいきます。
Orleansクラスターの基礎
クラスターメンバーシップ
// Cluster configuration for an Azure environment
public class Program
{
    /// <summary>
    /// Entry point: builds the default host, attaches the Orleans silo and the
    /// host-level services, then runs until the host is shut down.
    /// </summary>
    /// <param name="args">Command-line arguments forwarded to the host builder.</param>
    public static async Task Main(string[] args)
    {
        await Host.CreateDefaultBuilder(args)
            .UseOrleans(ConfigureSilo)
            .ConfigureServices(ConfigureHostServices)
            .Build()
            .RunAsync();
    }

    /// <summary>
    /// Configures clustering, cluster identity, endpoints and the dashboard on the silo.
    /// </summary>
    private static void ConfigureSilo(HostBuilderContext context, ISiloBuilder siloBuilder)
    {
        var settings = context.Configuration;

        siloBuilder
            // Membership backed by Azure Table Storage
            .UseAzureStorageClustering(clustering =>
            {
                clustering.ConnectionString = settings["Orleans:AzureStorage:ConnectionString"];
                clustering.TableName = "OrleansClusterMembership";
            })
            // Cluster identity, with fallbacks when the configuration keys are absent
            .Configure<ClusterOptions>(cluster =>
            {
                cluster.ClusterId = settings["Orleans:ClusterId"] ?? "production-cluster";
                cluster.ServiceId = settings["Orleans:ServiceId"] ?? "MyOrleansService";
            })
            // Silo-to-silo and client gateway endpoints
            .ConfigureEndpoints(
                siloPort: 11111,
                gatewayPort: 30000,
                listenOnAnyHostAddress: true)
            // Dashboard
            .UseDashboard(dashboard =>
            {
                dashboard.Port = 8080;
                dashboard.HostSelf = true;
                dashboard.CounterUpdateIntervalMs = 1000;
            });
    }

    /// <summary>
    /// Sets the host shutdown timeout to 30 seconds.
    /// </summary>
    private static void ConfigureHostServices(IServiceCollection services)
    {
        services.Configure<HostOptions>(hostOptions =>
        {
            hostOptions.ShutdownTimeout = TimeSpan.FromSeconds(30);
        });
    }
}
様々なメンバーシッププロバイダー
// 1. Development: localhost
siloBuilder.UseLocalhostClustering();

// 2. Azure Table Storage
siloBuilder.UseAzureStorageClustering(azure =>
{
    azure.ConnectionString = azureConnectionString;
    azure.TableName = "OrleansCluster";
});

// 3. ADO.NET (SQL Server, PostgreSQL, MySQL)
siloBuilder.UseAdoNetClustering(adoNet =>
{
    adoNet.Invariant = "System.Data.SqlClient";
    adoNet.ConnectionString = sqlConnectionString;
});

// 4. Apache ZooKeeper
siloBuilder.UseZooKeeperClustering(zooKeeper => zooKeeper.ConnectionString = "localhost:2181");

// 5. Consul
siloBuilder.UseConsulClustering(consul =>
{
    consul.Address = new Uri("http://localhost:8500");
    consul.KvRootFolder = "orleans";
});

// 6. Kubernetes
siloBuilder.UseKubernetesClustering(kubernetes =>
{
    kubernetes.Group = "orleans.io";
    kubernetes.ApiVersion = "v1";
});
Kubernetesでの本番環境構築
Kubernetes用のSilo構成
// Program.cs - for Kubernetes environments
public class Program
{
    /// <summary>
    /// Entry point: builds a host whose Orleans silo uses Kubernetes clustering,
    /// Redis grain storage, OpenTelemetry tracing exported to Jaeger, and health checks,
    /// then runs it until shutdown.
    /// </summary>
    /// <param name="args">Command-line arguments forwarded to the host builder.</param>
    public static async Task Main(string[] args)
    {
        var host = Host.CreateDefaultBuilder(args)
            .UseOrleans((context, siloBuilder) =>
            {
                var configuration = context.Configuration;

                siloBuilder
                    // Kubernetes clustering
                    .UseKubernetesClustering(options =>
                    {
                        // CustomResourceDefinition settings
                        options.Group = "orleans.io";
                        options.ApiVersion = "v1beta1";
                        options.PluralName = "silos";

                        // The pod spec in this article injects ORLEANS_CLUSTER_ID and
                        // POD_NAMESPACE as environment variables, so fall back to those
                        // names when the original configuration keys are not set.
                        // Otherwise the silo would silently register in "default".
                        options.ClusterId = configuration["Orleans:ClusterId"]
                            ?? configuration["ORLEANS_CLUSTER_ID"];
                        options.Namespace = configuration["NAMESPACE"]
                            ?? configuration["POD_NAMESPACE"]
                            ?? "default";
                        options.CanCreateResources = true;
                    })
                    // Persistence in Redis
                    .AddRedisGrainStorage("redis-store", options =>
                    {
                        options.ConnectionString = configuration["Redis:ConnectionString"];
                        options.UseJson = true;
                        options.DatabaseNumber = 0;
                    })
                    // Distributed tracing
                    .AddActivityPropagation()
                    .ConfigureServices(services =>
                    {
                        services.AddOpenTelemetry()
                            .WithTracing(builder =>
                            {
                                builder
                                    .AddAspNetCoreInstrumentation()
                                    .AddHttpClientInstrumentation()
                                    .AddSource("Orleans.Runtime")
                                    .AddSource("Orleans.Application")
                                    .AddJaegerExporter(options =>
                                    {
                                        options.AgentHost = configuration["Jaeger:AgentHost"];
                                        options.AgentPort = configuration.GetValue<int>("Jaeger:AgentPort");
                                    });
                            });
                    });

                // Health checks
                // NOTE(review): this only registers the checks; no HTTP endpoint is mapped
                // in this snippet - confirm where /health/live and /health/ready are served.
                siloBuilder.ConfigureServices(services =>
                {
                    services.AddHealthChecks()
                        .AddCheck<SiloHealthCheck>("silo_health")
                        .AddCheck<GrainHealthCheck>("grain_health");
                });
            })
            .Build();

        await host.RunAsync();
    }
}
Kubernetes マニフェスト
# orleans-crd.yaml - Custom Resource Definition
# Registers the namespaced custom resource kind "Silo" (plural "silos") in API
# group orleans.io, served and stored as version v1beta1.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: silos.orleans.io
spec:
group: orleans.io
versions:
- name: v1beta1
served: true
storage: true
schema:
openAPIV3Schema:
type: object
properties:
# NOTE(review): spec and status declare no properties; with apiextensions.k8s.io/v1
# unknown fields may be pruned unless x-kubernetes-preserve-unknown-fields is set.
# Confirm against the fields the clustering provider writes.
spec:
type: object
status:
type: object
scope: Namespaced
names:
plural: silos
singular: silo
kind: Silo
---
# orleans-deployment.yaml
# StatefulSet running 5 silo replicas in the orleans-system namespace.
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: orleans-silo
namespace: orleans-system
spec:
# NOTE(review): serviceName refers to a governing Service named "orleans-silo";
# no such Service appears in the manifests shown here - confirm it exists.
serviceName: orleans-silo
replicas: 5
selector:
matchLabels:
app: orleans-silo
template:
metadata:
labels:
app: orleans-silo
orleans-cluster-member: "true"
spec:
serviceAccountName: orleans-silo
containers:
- name: silo
# NOTE(review): a mutable ":latest" tag makes rollouts non-reproducible; consider pinning a version.
image: myregistry.azurecr.io/orleans-silo:latest
ports:
# Silo-to-silo port
- containerPort: 11111
name: silo
protocol: TCP
# Client gateway port
- containerPort: 30000
name: gateway
protocol: TCP
# Dashboard port
- containerPort: 8080
name: dashboard
protocol: TCP
env:
# NOTE(review): confirm the silo's configuration actually reads these variable names.
- name: ORLEANS_SERVICE_ID
value: "MyOrleansService"
- name: ORLEANS_CLUSTER_ID
value: "production-cluster"
# The following three values are injected from the pod's own metadata/status.
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_IP
valueFrom:
fieldRef:
fieldPath: status.podIP
resources:
requests:
memory: "2Gi"
cpu: "1000m"
limits:
memory: "4Gi"
cpu: "2000m"
# NOTE(review): both probes target port 8080 (named "dashboard" above);
# confirm the health endpoints are really served on that port.
livenessProbe:
httpGet:
path: /health/live
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
readinessProbe:
httpGet:
path: /health/ready
port: 8080
initialDelaySeconds: 10
periodSeconds: 5
volumeMounts:
- name: config
mountPath: /app/config
readOnly: true
volumes:
# Configuration files come from the "orleans-config" ConfigMap.
- name: config
configMap:
name: orleans-config
---
# orleans-service.yaml
# LoadBalancer Service exposing gateway port 30000 of pods labelled app=orleans-silo.
apiVersion: v1
kind: Service
metadata:
name: orleans-gateway
namespace: orleans-system
spec:
type: LoadBalancer
selector:
app: orleans-silo
ports:
- name: gateway
port: 30000
targetPort: 30000
protocol: TCP
---
# orleans-dashboard-service.yaml
# ClusterIP Service exposing port 8080 of pods labelled app=orleans-silo.
apiVersion: v1
kind: Service
metadata:
name: orleans-dashboard
namespace: orleans-system
spec:
type: ClusterIP
selector:
app: orleans-silo
ports:
- name: dashboard
port: 8080
targetPort: 8080
---
# orleans-rbac.yaml
# Service account used by the silo pods.
apiVersion: v1
kind: ServiceAccount
metadata:
name: orleans-silo
namespace: orleans-system
---
# Grants read/write access to "silos" resources in the orleans.io API group.
# NOTE(review): this is a cluster-wide role; since the resource is namespaced, a
# namespaced Role/RoleBinding may be sufficient - confirm the required scope.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: orleans-silo
rules:
- apiGroups: ["orleans.io"]
resources: ["silos"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
# Binds the ClusterRole above to the orleans-silo service account in orleans-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: orleans-silo
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: orleans-silo
subjects:
- kind: ServiceAccount
name: orleans-silo
namespace: orleans-system
高可用性の実現
1. Grainの配置戦略
// Custom placement strategy
/// <summary>
/// Marks a grain class so that it is placed using <see cref="PreferLocalPlacementStrategy"/>.
/// </summary>
[AttributeUsage(AttributeTargets.Class)]
public class PreferLocalPlacementAttribute : PlacementAttribute
{
    /// <summary>Creates the attribute with a new <see cref="PreferLocalPlacementStrategy"/> instance.</summary>
    public PreferLocalPlacementAttribute() : base(new PreferLocalPlacementStrategy()) { }
}
/// <summary>
/// Marker strategy type with no members of its own; it is the strategy instance
/// passed to the base class by <c>PreferLocalPlacementAttribute</c>.
/// </summary>
public class PreferLocalPlacementStrategy : PlacementStrategy
{
}
/// <summary>
/// Placement director that picks the local silo when it is compatible with the
/// target, and otherwise the compatible silo reporting the fewest active grains.
/// </summary>
public class PreferLocalPlacementDirector : IPlacementDirector
{
    /// <summary>
    /// Chooses the silo on which a new activation should be created.
    /// </summary>
    /// <param name="strategy">The placement strategy in effect (not inspected here).</param>
    /// <param name="target">The activation being placed.</param>
    /// <param name="context">Placement context providing the silo list and statistics.</param>
    /// <returns>
    /// A task holding the chosen silo. The task is faulted, rather than the call
    /// throwing, when selection fails; this matches the original async method.
    /// </returns>
    public Task<SiloAddress> OnAddActivation(
        PlacementStrategy strategy,
        PlacementTarget target,
        IPlacementContext context)
    {
        try
        {
            return Task.FromResult(SelectSilo(target, context));
        }
        catch (Exception ex)
        {
            return Task.FromException<SiloAddress>(ex);
        }
    }

    /// <summary>Synchronous selection logic behind <see cref="OnAddActivation"/>.</summary>
    private static SiloAddress SelectSilo(PlacementTarget target, IPlacementContext context)
    {
        var silos = context.GetCompatibleSilos(target).ToList();

        // The previous random fallback was only reachable with an empty list, where it
        // always failed with an index-out-of-range error. Fail with a clear message instead.
        if (silos.Count == 0)
        {
            throw new InvalidOperationException(
                "No compatible silos are available for placement.");
        }

        // Prefer the local silo
        var localSilo = silos.FirstOrDefault(s => s.Equals(context.LocalSilo));
        if (localSilo is not null)
        {
            return localSilo;
        }

        // Otherwise balance load. A silo without statistics counts as zero load,
        // so it wins ties; MinBy keeps the first minimum like the original ordering did.
        return silos.MinBy(s => context.GetSiloStatistics(s)?.ActiveGrainCount ?? 0);
    }
}
// Usage on a grain
/// <summary>
/// Example grain opted into the custom placement via the <c>[PreferLocalPlacement]</c> attribute.
/// </summary>
[PreferLocalPlacement]
public class CacheGrain : Grain, ICacheGrain
{
// More likely to be placed on the local silo
}
2. 自動フェイルオーバー
// Health check implementation
/// <summary>
/// Health check reporting the local silo status and the number of active silos in the cluster.
/// </summary>
/// <remarks>
/// NOTE(review): <c>GetSiloStatusAsync</c> and <c>GetClusterMembershipAsync</c> are used as
/// written in the article; verify that they exist in the Orleans version in use.
/// </remarks>
public class SiloHealthCheck : IHealthCheck
{
    private readonly ISiloHost _siloHost;
    private readonly IClusterMembershipService _membershipService;

    /// <summary>Stores the silo host and membership service used by the check.</summary>
    public SiloHealthCheck(
        ISiloHost siloHost,
        IClusterMembershipService membershipService)
        => (_siloHost, _membershipService) = (siloHost, membershipService);

    /// <summary>
    /// Returns Unhealthy when the silo is not Active or any exception occurs, Degraded
    /// when fewer than 3 silos are Active, and Healthy otherwise.
    /// </summary>
    /// <param name="context">Health check context (not inspected).</param>
    /// <param name="cancellationToken">Cancellation token (not observed by this check).</param>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            var siloStatus = await _siloHost.GetSiloStatusAsync();
            if (siloStatus != SiloStatus.Active)
            {
                return HealthCheckResult.Unhealthy(
                    $"Silo status is {siloStatus}");
            }

            var snapshot = await _membershipService.GetClusterMembershipAsync();
            var activeCount = snapshot.Members.Count(member => member.Status == SiloStatus.Active);

            return activeCount < 3
                ? HealthCheckResult.Degraded(
                    $"Only {activeCount} active silos in cluster")
                : HealthCheckResult.Healthy(
                    $"Silo is healthy. {activeCount} active silos in cluster");
        }
        catch (Exception failure)
        {
            return HealthCheckResult.Unhealthy(
                "Health check failed",
                failure);
        }
    }
}
// Grain-level failover handling
/// <summary>
/// Grain that records the runtime identity it was last activated under and runs
/// recovery logic when it is re-activated under a different one.
/// </summary>
/// <remarks>
/// <c>RecoverTransactionAsync</c> and <c>LogFailoverEventAsync</c> are not shown in this
/// snippet and must be provided elsewhere in the class.
/// </remarks>
public class ResilientGrain : Grain, IResilientGrain
{
    private readonly IPersistentState<MyState> _state;

    /// <summary>
    /// Receives the persistent state by injection; without this constructor the
    /// field stayed null and activation failed with a null reference.
    /// </summary>
    /// <param name="state">
    /// State bound to the "redis-store" storage provider.
    /// NOTE(review): the state name "resilient" is a placeholder - confirm the intended name.
    /// </param>
    public ResilientGrain(
        [PersistentState("resilient", "redis-store")] IPersistentState<MyState> state)
    {
        _state = state;
    }

    /// <summary>
    /// On activation, runs failover recovery if the stored identity differs from the
    /// current one, then persists the current identity.
    /// </summary>
    // Grain.OnActivateAsync is public; an override cannot narrow it to protected.
    public override async Task OnActivateAsync()
    {
        await base.OnActivateAsync();

        // Handle the case where the grain moved from a previous silo
        if (_state.State.LastActiveSilo != null &&
            _state.State.LastActiveSilo != RuntimeIdentity)
        {
            await HandleFailoverAsync();
        }

        _state.State.LastActiveSilo = RuntimeIdentity;
        await _state.WriteStateAsync();
    }

    /// <summary>
    /// Recovers every transaction still marked Processing, then records the failover event.
    /// </summary>
    private async Task HandleFailoverAsync()
    {
        // Check for unfinished transactions; a null collection is treated as empty
        var allTransactions = _state.State.PendingTransactions;
        if (allTransactions is not null)
        {
            var pendingTransactions = allTransactions
                .Where(t => t.Status == TransactionStatus.Processing)
                .ToList();

            foreach (var transaction in pendingTransactions)
            {
                // Inspect the transaction state and handle it appropriately
                await RecoverTransactionAsync(transaction);
            }
        }

        // Record the failover event
        await LogFailoverEventAsync();
    }
}
3. ローリングアップデート
// Graceful shutdown implementation
/// <summary>
/// Hosted service that, on host shutdown, stops new activations, waits for active
/// grains to drain (up to five minutes) and then stops the silo.
/// </summary>
/// <remarks>
/// NOTE(review): <c>StopAcceptingNewActivationsAsync</c> and <c>GetRuntimeStatisticsAsync</c>
/// are used as written in the article; verify they exist in the Orleans version in use.
/// The wait is also bounded by the host's shutdown timeout, which cancels the token
/// passed to <see cref="StopAsync"/>.
/// </remarks>
public class GracefulShutdownHostedService : IHostedService
{
    private static readonly TimeSpan DrainTimeout = TimeSpan.FromMinutes(5);
    private static readonly TimeSpan PollInterval = TimeSpan.FromSeconds(5);

    private readonly ISiloHost _siloHost;
    private readonly IGrainFactory _grainFactory;
    private readonly ILogger<GracefulShutdownHostedService> _logger;

    /// <summary>Receives the dependencies from the container; the fields were previously never assigned.</summary>
    public GracefulShutdownHostedService(
        ISiloHost siloHost,
        IGrainFactory grainFactory,
        ILogger<GracefulShutdownHostedService> logger)
    {
        _siloHost = siloHost;
        _grainFactory = grainFactory;
        _logger = logger;
    }

    /// <summary>Nothing to do at startup; required by <see cref="IHostedService"/>.</summary>
    public Task StartAsync(CancellationToken cancellationToken) => Task.CompletedTask;

    /// <summary>
    /// Drains the silo and stops it. If the token is cancelled while waiting, the
    /// wait ends early but the silo is still stopped.
    /// </summary>
    /// <param name="cancellationToken">Signals that shutdown should no longer be graceful.</param>
    public async Task StopAsync(CancellationToken cancellationToken)
    {
        _logger.LogInformation("Starting graceful shutdown...");

        // Stop new activations
        await _siloHost.StopAcceptingNewActivationsAsync();

        // Wait for active grains to move away
        var stopwatch = Stopwatch.StartNew();
        var drained = false;
        try
        {
            while (stopwatch.Elapsed < DrainTimeout)
            {
                var statistics = await _siloHost.GetRuntimeStatisticsAsync();
                if (statistics.ActiveGrainCount == 0)
                {
                    _logger.LogInformation("All grains have been deactivated");
                    drained = true;
                    break;
                }

                _logger.LogInformation(
                    "Waiting for {ActiveGrainCount} grains to deactivate...",
                    statistics.ActiveGrainCount);
                await Task.Delay(PollInterval, cancellationToken);
            }

            if (!drained)
            {
                _logger.LogWarning(
                    "Timed out after {DrainTimeout} waiting for grains to deactivate",
                    DrainTimeout);
            }
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Previously this exception escaped and the silo was never stopped.
            _logger.LogWarning("Shutdown was cancelled while waiting for grains to deactivate");
        }

        // Shut down completely
        await _siloHost.StopAsync(cancellationToken);
    }
}
// Kubernetes用のローリングアップデート設定
/*
kubectl set image statefulset/orleans-silo \
silo=myregistry.azurecr.io/orleans-silo:v2.0 \
--namespace=orleans-system
# または、Helmを使用
helm upgrade orleans-cluster ./orleans-chart \
--set image.tag=v2.0 \
--namespace orleans-system \
--wait
*/
監視とオブザーバビリティ
1. カスタムメトリクスの実装
/// <summary>
/// Hosted service that publishes cluster-level metrics on the "Orleans.Cluster" meter
/// and collects grain statistics every 30 seconds.
/// </summary>
/// <remarks>
/// <c>GetActiveSilosCount</c>, <c>GetTotalGrainCount</c> and <c>RecordStreamMetrics</c> are
/// not shown in this snippet and must be provided elsewhere in the class.
/// NOTE(review): <c>ISiloStatusOracle.GetSimpleGrainStatisticsAsync</c> is used as written
/// in the article; verify it exists in the Orleans version in use.
/// </remarks>
public class OrleansMetricsCollector : IHostedService, IDisposable
{
    private static readonly TimeSpan CollectionInterval = TimeSpan.FromSeconds(30);

    private readonly IMeterFactory _meterFactory;
    private readonly IClusterMembershipService _membershipService;
    private readonly ISiloStatusOracle _siloStatusOracle;
    private readonly ILogger<OrleansMetricsCollector> _logger;
    private Timer _timer;

    // Metrics
    private readonly ObservableGauge<int> _activeSilosGauge;
    private readonly ObservableGauge<long> _totalGrainsGauge;
    private readonly Counter<long> _grainActivations;
    private readonly Counter<long> _grainDeactivations;
    private readonly Histogram<double> _grainCallDuration;

    /// <summary>Creates the meter and its instruments.</summary>
    /// <param name="meterFactory">Factory used to create the "Orleans.Cluster" meter.</param>
    /// <param name="membershipService">Cluster membership service.</param>
    /// <param name="siloStatusOracle">Source of grain statistics.</param>
    /// <param name="logger">Optional logger for collection failures; may be null.</param>
    public OrleansMetricsCollector(
        IMeterFactory meterFactory,
        IClusterMembershipService membershipService,
        ISiloStatusOracle siloStatusOracle,
        ILogger<OrleansMetricsCollector> logger = null)
    {
        _meterFactory = meterFactory;
        _membershipService = membershipService;
        _siloStatusOracle = siloStatusOracle;
        _logger = logger;

        var meter = meterFactory.Create("Orleans.Cluster");

        _activeSilosGauge = meter.CreateObservableGauge<int>(
            "orleans.cluster.active_silos",
            () => GetActiveSilosCount(),
            description: "Number of active silos in the cluster");
        _totalGrainsGauge = meter.CreateObservableGauge<long>(
            "orleans.cluster.total_grains",
            () => GetTotalGrainCount(),
            description: "Total number of active grains");
        _grainActivations = meter.CreateCounter<long>(
            "orleans.grain.activations",
            description: "Number of grain activations");
        _grainDeactivations = meter.CreateCounter<long>(
            "orleans.grain.deactivations",
            description: "Number of grain deactivations");
        _grainCallDuration = meter.CreateHistogram<double>(
            "orleans.grain.call_duration",
            unit: "ms",
            description: "Duration of grain method calls");
    }

    /// <summary>Starts the periodic collection timer; the first run is immediate.</summary>
    public Task StartAsync(CancellationToken cancellationToken)
    {
        _timer = new Timer(
            CollectMetrics,
            null,
            TimeSpan.Zero,
            CollectionInterval);
        return Task.CompletedTask;
    }

    /// <summary>Stops further timer callbacks; required by <see cref="IHostedService"/>.</summary>
    public Task StopAsync(CancellationToken cancellationToken)
    {
        _timer?.Change(Timeout.Infinite, Timeout.Infinite);
        return Task.CompletedTask;
    }

    /// <summary>Releases the timer.</summary>
    public void Dispose() => _timer?.Dispose();

    // Timer callback. The work runs in a Task-returning method that handles its own
    // exceptions, so nothing can escape onto the timer thread as with async void.
    private void CollectMetrics(object state) => _ = CollectMetricsAsync();

    /// <summary>Fetches grain statistics and records metrics for stream-related grain types.</summary>
    private async Task CollectMetricsAsync()
    {
        try
        {
            // Collect cluster statistics
            var statistics = await _siloStatusOracle.GetSimpleGrainStatisticsAsync();

            // Update custom metrics
            foreach (var stat in statistics)
            {
                if (stat.GrainType.Contains("Orleans.Streams"))
                {
                    // Stream-related metrics
                    RecordStreamMetrics(stat);
                }
            }
        }
        catch (Exception ex)
        {
            // Collection is best-effort: report the failure and wait for the next tick.
            _logger?.LogError(ex, "Failed to collect Orleans cluster metrics");
        }
    }
}
2. 分散トレーシング
// Tracing of grain calls
/// <summary>
/// Incoming grain call filter that wraps each call in an activity from the
/// "Orleans.Application" source and marks the activity as failed when the call throws.
/// </summary>
public class TracingGrainCallFilter : IIncomingGrainCallFilter
{
    private readonly ActivitySource _activitySource;

    /// <summary>Creates the activity source used by this filter.</summary>
    public TracingGrainCallFilter()
        => _activitySource = new ActivitySource("Orleans.Application");

    /// <summary>
    /// Invokes the grain call. When an activity is started, it is tagged with the grain
    /// type, method and id; exceptions are recorded on it and then rethrown.
    /// </summary>
    /// <param name="context">The incoming call being filtered.</param>
    public async Task Invoke(IIncomingGrainCallContext context)
    {
        var grainTypeName = context.Grain.GetType().Name;
        var invokedMethod = context.InterfaceMethod.Name;

        using var span = _activitySource.StartActivity(
            $"{grainTypeName}.{invokedMethod}",
            ActivityKind.Server);

        // No activity was started: run the call untraced.
        if (span is null)
        {
            await context.Invoke();
            return;
        }

        span.SetTag("grain.type", grainTypeName);
        span.SetTag("grain.method", invokedMethod);
        span.SetTag("grain.id", context.Grain.GetPrimaryKeyString());

        try
        {
            await context.Invoke();
        }
        catch (Exception error)
        {
            span.SetStatus(ActivityStatusCode.Error, error.Message);
            span.RecordException(error);
            throw;
        }
    }
}
3. アラートルール
# prometheus-alerts.yaml
# Alert rules for the Orleans cluster, evaluated every 30 seconds.
# NOTE(review): the metric names below assume a particular exporter naming scheme;
# exporters may add suffixes such as _total or a unit - confirm against the scraped names.
groups:
- name: orleans_alerts
interval: 30s
rules:
# Critical: fewer than 3 active silos for 2 minutes.
- alert: OrleansSiloDown
expr: orleans_cluster_active_silos < 3
for: 2m
labels:
severity: critical
annotations:
summary: "Orleans cluster has less than 3 active silos"
description: "Only {{ $value }} silos are active"
# Warning: more than 1000 grain activations per second sustained for 5 minutes.
- alert: OrleansHighGrainActivationRate
expr: rate(orleans_grain_activations[5m]) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High grain activation rate"
description: "Activation rate is {{ $value }} per second"
# Warning: 99th-percentile grain call duration above 1000 for 5 minutes.
- alert: OrleansGrainCallLatency
expr: |
histogram_quantile(0.99,
rate(orleans_grain_call_duration_bucket[5m])
) > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High grain call latency"
description: "P99 latency is {{ $value }}ms"
ディザスタリカバリー
マルチリージョン構成
// Multi-cluster configuration
/// <summary>
/// Helper that applies the multi-cluster settings to a silo builder.
/// </summary>
/// <remarks>
/// NOTE(review): the multi-cluster extension methods are used as written in the article;
/// verify they are available in the Orleans version in use.
/// </remarks>
public class MultiClusterConfiguration
{
    /// <summary>
    /// Configures Azure Table clustering, the multi-cluster network, the gossip channel
    /// and the multi-cluster service on <paramref name="siloBuilder"/>.
    /// </summary>
    /// <param name="siloBuilder">The silo builder to configure.</param>
    /// <param name="configuration">Source of the connection string and cluster id.</param>
    /// <returns>The builder returned by the last configuration call in the chain.</returns>
    public static ISiloBuilder ConfigureMultiCluster(
        ISiloBuilder siloBuilder,
        IConfiguration configuration) =>
        siloBuilder
            .UseAzureStorageClustering(clustering =>
            {
                clustering.ConnectionString = configuration["Orleans:AzureStorage:ConnectionString"];
                clustering.TableName = "OrleansClusterMembership";
            })
            .UseMultiCluster(multiCluster =>
            {
                multiCluster.ClusterId = configuration["Orleans:ClusterId"];
                multiCluster.HasMultiClusterNetwork = true;
                multiCluster.DefaultMultiCluster = new[] { "us-east", "eu-west", "asia-pacific" };
                multiCluster.MaxMultiClusterGateways = 10;
                multiCluster.BackgroundGossipInterval = TimeSpan.FromSeconds(30);
            })
            .UseAzureTableGossipChannel(gossip =>
            {
                gossip.ConnectionString = configuration["Orleans:AzureStorage:ConnectionString"];
                gossip.TableName = "OrleansGossipChannel";
            })
            .ConfigureMultiClusterService(service =>
            {
                service.ResponseTimeout = TimeSpan.FromSeconds(30);
                service.MaxSendAttempts = 3;
                service.MaxSendRetryDelay = TimeSpan.FromSeconds(5);
            });
}
// Grain communication across regions
/// <summary>
/// Grain that looks up a user profile locally first and then tries each listed region.
/// </summary>
public class GlobalUserGrain : Grain, IGlobalUserGrain
{
    private static readonly string[] FallbackClusters = { "us-east", "eu-west", "asia-pacific" };

    /// <summary>
    /// Returns the profile for <paramref name="userId"/>, falling back to the other
    /// regions when the local lookup fails with an <c>OrleansException</c>.
    /// </summary>
    /// <param name="userId">Key of the user grain to query.</param>
    /// <returns>The first profile successfully retrieved.</returns>
    /// <exception cref="InvalidOperationException">
    /// Every attempt failed; the inner <see cref="AggregateException"/> holds each failure.
    /// </exception>
    public async Task<UserProfile> GetUserProfileAsync(string userId)
    {
        // Check the local region first
        var localGrain = GrainFactory.GetGrain<IUserGrain>(userId);
        try
        {
            return await localGrain.GetProfileAsync();
        }
        catch (OrleansException localFailure)
        {
            // Keep every failure so the final error explains why each attempt failed.
            var failures = new List<Exception> { localFailure };

            // Fall back to the other regions
            foreach (var cluster in FallbackClusters)
            {
                try
                {
                    // NOTE(review): the second argument of GetGrain<T>(string, string) is a
                    // grain class name prefix, not a cluster id - confirm this really
                    // addresses a grain in another region.
                    var remoteGrain = GrainFactory.GetGrain<IUserGrain>(
                        userId,
                        cluster);
                    return await remoteGrain.GetProfileAsync();
                }
                catch (Exception ex)
                {
                    // Try the next region
                    failures.Add(ex);
                }
            }

            throw new InvalidOperationException(
                "User not found in any region",
                new AggregateException(failures));
        }
    }
}
パフォーマンスチューニング
Siloの最適化
// High-performance settings
siloBuilder
    // Connection limits
    .Configure<SiloOptions>(silo =>
    {
        silo.DefaultConnectionLimit = 1000;
        silo.ServicePointOptions = ServicePointOptions.Default with
        {
            ConnectionLimit = 1000,
            DefaultConnectionLimit = 1000,
            Expect100Continue = false,
            UseNagleAlgorithm = false
        };
    })
    // Load shedding
    .Configure<LoadSheddingOptions>(loadShedding =>
    {
        loadShedding.LoadSheddingEnabled = true;
        loadShedding.LoadSheddingLimit = 95;
    })
    // Scheduler
    .Configure<SchedulingOptions>(scheduling =>
    {
        scheduling.MaxActiveThreads = Environment.ProcessorCount * 4;
        scheduling.DelayWarningThreshold = TimeSpan.FromSeconds(10);
        scheduling.ActivationSchedulingQuantum = TimeSpan.FromMilliseconds(100);
        scheduling.TurnWarningLengthThreshold = TimeSpan.FromSeconds(5);
    })
    // Messaging
    .Configure<MessagingOptions>(messaging =>
    {
        messaging.ResponseTimeout = TimeSpan.FromSeconds(30);
        messaging.MaxResendCount = 3;
        messaging.ResendOnTimeout = true;
        messaging.DropExpiredMessages = true;
        messaging.BufferPoolBufferSize = 4 * 1024;
        messaging.BufferPoolMaxSize = 10000;
        messaging.BufferPoolPreallocationSize = 1000;
    })
    // Networking
    .Configure<NetworkingOptions>(networking =>
    {
        networking.OpenConnectionTimeout = TimeSpan.FromSeconds(10);
        networking.MaxSocketAge = TimeSpan.FromHours(1);
    });
まとめ
今回は、.NET Orleansのクラスタリングと高可用性について学びました。重要なポイント:
- 柔軟なクラスタリング: 様々な環境(Kubernetes、Azure、オンプレミス)に対応
- 自動フェイルオーバー: Grainの自動移行と状態の保持
- 無停止アップデート: ローリングアップデートによるサービス継続性
- 包括的な監視: メトリクス、トレーシング、アラートの統合
- ディザスタリカバリー: マルチリージョン構成での冗長性
次回は、実践的なパフォーマンスチューニングとトラブルシューティングについて解説します。
次回予告:「第5回:パフォーマンスチューニングとトラブルシューティング」では、Orleansアプリケーションの性能を最大化し、よくある問題を解決する実践的なテクニックを解説します。