setup-soft-roce.sh 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. #!/bin/bash
  2. # Setup Soft-RoCE (RXE) for RDMA simulation
  3. # This script enables RDMA over Ethernet using the RXE kernel module
  # Stop at the first command that fails outside of a tested context
  # (errexit), then announce what the script is about to do.
  set -o errexit
  printf '%s\n' "🔧 Setting up Soft-RoCE (RXE) RDMA simulation..."
  # Ensure the script runs as root.
  #
  # Returns 0 when the effective UID is 0. Otherwise prints a hint on how to
  # re-run the script and terminates the whole script with exit status 1.
  check_privileges() {
    if [[ "$EUID" -eq 0 ]]; then
      return 0
    fi

    echo "❌ This script requires root privileges"
    echo "Run with: sudo $0 or inside a privileged container"
    exit 1
  }
  # Load the Soft-RoCE kernel module and confirm it is present.
  #
  # Tries `rdma_rxe` first and falls back to `rxe_net` (older kernels).
  # modprobe's own error output is discarded on purpose: a failure of the
  # first attempt is expected on some kernels and is reported by this function.
  # Exits the script with status 1 when neither module loads or when neither
  # name shows up in the `lsmod` listing afterwards.
  load_rxe_module() {
    echo "📦 Loading RXE kernel module..."

    if ! modprobe rdma_rxe 2>/dev/null; then
      echo "⚠️ Failed to load rdma_rxe module, trying alternative approach..."

      # Alternative module name used by older kernels.
      if ! modprobe rxe_net 2>/dev/null; then
        printf '%s\n' \
          "❌ Failed to load RXE modules. Possible causes:" \
          " - Kernel doesn't support RXE (needs CONFIG_RDMA_RXE=m)" \
          " - Running in unprivileged container" \
          " - Missing kernel modules" \
          "" \
          "🔧 Workaround: Run container with --privileged flag"
        exit 1
      fi

      echo "✅ rxe_net module loaded successfully"
    else
      echo "✅ rdma_rxe module loaded successfully"
    fi

    # A successful modprobe is not trusted blindly: the module must also be
    # visible in the list of loaded modules.
    if ! lsmod | grep -q "rdma_rxe\|rxe_net"; then
      echo "❌ RXE module verification failed"
      exit 1
    fi

    echo "✅ RXE module verification successful"
  }
  # Function to set up the virtual RDMA (RXE) device.
  #
  # Selects a network interface and binds a Soft-RoCE device to it.
  #
  # Globals:
  #   RXE_INTERFACE (read, optional) - interface to use instead of probing the
  #                                    built-in candidate list.
  # Calls:
  #   setup_rxe_manual - last-resort sysfs based creation.
  # Exits with status 1 when no usable interface is found (or when the
  # fallback helper exits).
  #
  # Creation is attempted in this order:
  #   1. `rdma link add` (iproute2, netlink based - the current mechanism)
  #   2. `rxe_cfg add`   (legacy rdma-core helper)
  #   3. setup_rxe_manual (legacy sysfs module parameter)
  setup_rxe_device() {
    echo "🌐 Setting up RXE device over Ethernet interface..."

    local interface="${RXE_INTERFACE:-}"
    local iface

    if [ -n "$interface" ]; then
      # An explicit request must name an existing interface; silently falling
      # back to another one would hide a configuration mistake.
      if ! ip link show "$interface" >/dev/null 2>&1; then
        echo "❌ Requested network interface not found: $interface"
        exit 1
      fi
    else
      # Find available network interface (prefer eth0, fallback to others)
      for iface in eth0 enp0s3 enp0s8 lo; do
        if ip link show "$iface" >/dev/null 2>&1; then
          interface="$iface"
          break
        fi
      done
    fi

    if [ -z "$interface" ]; then
      echo "❌ No suitable network interface found"
      echo "Available interfaces:"
      ip link show | grep "^[0-9]" | cut -d':' -f2 | tr -d ' '
      exit 1
    fi

    echo "📡 Using network interface: $interface"
    echo "🔨 Creating RXE device on $interface..."

    # Preferred path: netlink via iproute2. The device is named rxe0, the
    # same name the legacy tools give to the first RXE device.
    if command -v rdma >/dev/null 2>&1; then
      if rdma link add rxe0 type rxe netdev "$interface"; then
        return 0
      fi
      echo "⚠️ rdma link add failed, trying legacy tools..."
    fi

    # Legacy path: rxe_cfg helper, then the raw sysfs interface.
    if command -v rxe_cfg >/dev/null 2>&1; then
      rxe_cfg add "$interface" || {
        echo "⚠️ rxe_cfg failed, trying manual approach..."
        setup_rxe_manual "$interface"
      }
    else
      echo "⚠️ rxe_cfg not available, using manual setup..."
      setup_rxe_manual "$interface"
    fi
  }
  # Function to manually set up an RXE device through sysfs.
  #
  # Arguments:
  #   $1 - name of the network interface to bind the RXE device to (required)
  # Writes the interface name to the rdma_rxe module's `add` parameter.
  # Exits the script with status 1 when the argument is missing, when the
  # rdma_rxe sysfs directory or its `add` parameter is absent, or when the
  # write is rejected.
  setup_rxe_manual() {
    local interface="${1:-}"
    local module_dir=/sys/module/rdma_rxe
    local add_param="$module_dir/parameters/add"

    # Writing an empty name to sysfs would only produce a confusing failure.
    if [ -z "$interface" ]; then
      echo "❌ No network interface given for manual RXE setup"
      exit 1
    fi

    if [ ! -d "$module_dir" ]; then
      echo "❌ RXE sysfs interface not found"
      exit 1
    fi

    # The module can be loaded without exposing this legacy parameter;
    # say so explicitly instead of failing on the redirect.
    if [ ! -e "$add_param" ]; then
      echo "❌ RXE sysfs 'add' parameter not available ($add_param)"
      echo "   This kernel may only support: rdma link add <name> type rxe netdev $interface"
      exit 1
    fi

    # The group makes 2>/dev/null cover both a failed open of the sysfs file
    # and a write error reported by echo itself.
    if ! { echo "$interface" >"$add_param"; } 2>/dev/null; then
      echo "❌ Failed to add RXE device via sysfs"
      exit 1
    fi
  }
  # Function to verify that at least one RDMA device is registered.
  #
  # Arguments:
  #   $1 - directory holding one entry per RDMA device
  #        (optional, default: /sys/class/infiniband)
  # Outputs:
  #   The number of devices, their names (one per line) and, when
  #   ibv_devinfo is installed, the first 10 lines of its report per device.
  # Exits the script with status 1 when the directory is missing or empty.
  verify_rdma_devices() {
    local sysfs_dir="${1:-/sys/class/infiniband}"
    local entry dev_name
    local -a devices=()

    echo "🔍 Verifying RDMA devices..."

    if [ ! -d "$sysfs_dir" ]; then
      echo "❌ $sysfs_dir directory not found"
      exit 1
    fi

    # Collect entries with a glob instead of parsing `ls` output. The
    # existence test drops the literal pattern left behind when nothing
    # matches; -L keeps dangling symlinks, which `ls` would list as well.
    for entry in "$sysfs_dir"/*; do
      if [ -e "$entry" ] || [ -L "$entry" ]; then
        devices+=("$entry")
      fi
    done

    if [ "${#devices[@]}" -eq 0 ]; then
      echo "❌ No RDMA devices found in $sysfs_dir/"
      exit 1
    fi

    echo "✅ Found ${#devices[@]} RDMA device(s):"
    for entry in "${devices[@]}"; do
      printf '%s\n' "${entry##*/}"
    done

    # Show device details
    for entry in "${devices[@]}"; do
      if [ -d "$entry" ]; then
        dev_name="${entry##*/}"
        echo " 📋 Device: $dev_name"
        # Device info is best effort: a failing or truncated ibv_devinfo
        # must not abort the verification.
        if command -v ibv_devinfo >/dev/null 2>&1; then
          ibv_devinfo -d "$dev_name" | head -10 || true
        fi
      fi
    done
  }
  # Smoke-test the RDMA user-space tooling.
  #
  # Prints the first 20 lines of `ibv_devinfo` and the first 10 lines of
  # `ucx_info -d` when the respective tool is installed; otherwise prints a
  # warning for the missing tool and carries on.
  test_basic_rdma() {
    echo "🧪 Testing basic RDMA functionality..."

    # libibverbs
    if ! command -v ibv_devinfo >/dev/null 2>&1; then
      echo "⚠️ ibv_devinfo not available"
    else
      echo "📋 RDMA device information:"
      ibv_devinfo | head -n 20
    fi

    # UCX (optional)
    if ! command -v ucx_info >/dev/null 2>&1; then
      echo "⚠️ UCX tools not available"
    else
      echo "📋 UCX information:"
      ucx_info -d | head -n 10
    fi
  }
  # Entry point: runs every setup stage in order and prints a summary.
  #
  # Stages (each one is expected to exit the script itself on failure):
  #   check_privileges -> load_rxe_module -> setup_rxe_device
  #   -> verify_rdma_devices -> test_basic_rdma
  main() {
    printf '%s\n' \
      "🚀 Starting Soft-RoCE RDMA simulation setup..." \
      "======================================"

    local stage
    for stage in \
      check_privileges \
      load_rxe_module \
      setup_rxe_device \
      verify_rdma_devices \
      test_basic_rdma; do
      "$stage"
    done

    printf '%s\n' \
      "" \
      "🎉 Soft-RoCE setup completed successfully!" \
      "======================================" \
      "✅ RDMA simulation is ready for testing" \
      "📡 You can now run RDMA applications" \
      "" \
      "Next steps:" \
      " - Test with: /opt/rdma-sim/test-rdma.sh" \
      " - Check UCX: /opt/rdma-sim/ucx-info.sh" \
      " - Run your RDMA applications"
  }

  # Run the script, forwarding all command-line arguments.
  main "$@"