test-rdma.sh 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. #!/bin/bash
  2. # Test RDMA functionality in simulation environment
  3. # This script validates that RDMA devices and libraries are working
  # Abort as soon as a command fails outside of a conditional context.
  set -e

  printf '%s\n' "🧪 Testing RDMA simulation environment..."

  # ANSI colour sequences for status output. They are stored as literal
  # backslash escapes and only turned into control characters when printed.
  NC='\033[0m' # No Color
  BLUE='\033[0;34m'
  YELLOW='\033[1;33m'
  GREEN='\033[0;32m'
  RED='\033[0;31m'
  # Function to print colored output
  #
  # Globals:   RED, GREEN, YELLOW, BLUE, NC (read) - ANSI colour sequences
  # Arguments: $1 - status: "success", "warning", "error" or "info"
  #            $2 - message text, printed verbatim
  # Outputs:   one line on stdout
  # Returns:   0
  print_status() {
    local status="$1"
    local message="$2"
    local color icon

    case "$status" in
      "success")
        color="$GREEN"
        icon="✅"
        ;;
      "warning")
        color="$YELLOW"
        icon="⚠️"
        ;;
      "error")
        color="$RED"
        icon="❌"
        ;;
      "info")
        color="$BLUE"
        icon="📋"
        ;;
      *)
        # Unknown status: print the message uncoloured instead of dropping it.
        printf '%s\n' "$message"
        return 0
        ;;
    esac

    # %b expands the backslash escapes of the colour codes only; the message
    # goes through %s so backslashes it contains (paths, regexes) stay literal.
    printf '%b%s %s%b\n' "$color" "$icon" "$message" "$NC"
  }
  # Function to test RDMA devices
  #
  # Every directory entry directly below the sysfs class directory is treated
  # as one RDMA device, so the reported count always matches the listed names.
  #
  # Arguments: $1 - (optional) class directory to scan;
  #                 defaults to /sys/class/infiniband
  # Outputs:   status lines via print_status
  # Returns:   0 if at least one device was found, 1 otherwise
  test_rdma_devices() {
    local sysfs_dir="${1:-/sys/class/infiniband}"
    local entry
    local -a devices=()

    print_status "info" "Testing RDMA devices..."

    if [ ! -d "$sysfs_dir" ]; then
      print_status "error" "$sysfs_dir directory not found"
      return 1
    fi

    # Glob instead of parsing ls output. An unmatched glob stays literal and
    # is rejected by the -d test, so an empty directory yields no devices.
    for entry in "$sysfs_dir"/*; do
      if [ -d "$entry" ]; then
        devices+=("${entry##*/}")
      fi
    done

    if [ "${#devices[@]}" -eq 0 ]; then
      print_status "error" "No RDMA devices found"
      return 1
    fi

    print_status "success" "Found ${#devices[@]} RDMA device(s)"
    for entry in "${devices[@]}"; do
      print_status "info" "Device: $entry"
    done
    return 0
  }
  # Function to test libibverbs
  #
  # Runs ibv_devinfo and reports whether it printed anything, echoes the first
  # five lines of that output, and looks for a transport line in it.
  #
  # Outputs: status lines via print_status plus a short ibv_devinfo excerpt
  # Returns: 0 when ibv_devinfo produced output, 1 when it is missing or silent
  test_libibverbs() {
    print_status "info" "Testing libibverbs..."

    if ! command -v ibv_devinfo >/dev/null 2>&1; then
      print_status "error" "ibv_devinfo command not found"
      return 1
    fi

    # Only the captured text matters here; the exit status is ignored.
    local verbs_report
    verbs_report=$(ibv_devinfo 2>/dev/null) || true

    if [ -z "$verbs_report" ]; then
      print_status "error" "ibv_devinfo found no devices"
      return 1
    fi

    print_status "success" "libibverbs working - devices detected"

    # Show basic info
    head -5 <<<"$verbs_report"

    # Test device capabilities
    if grep -q "transport.*InfiniBand\|transport.*Ethernet" <<<"$verbs_report"; then
      print_status "success" "RDMA transport layer detected"
    else
      print_status "warning" "Transport layer information unclear"
    fi
    return 0
  }
  # Function to test UCX
  #
  # Runs "ucx_info -d", echoes the first ten lines of its output and checks
  # whether any listed transport belongs to the rc/ud/dc families.
  #
  # Outputs: status lines via print_status plus a short ucx_info excerpt
  # Returns: 0 when ucx_info produced output, 1 when it is missing or silent
  test_ucx() {
    local ucx_output

    print_status "info" "Testing UCX..."

    if ! command -v ucx_info >/dev/null 2>&1; then
      print_status "warning" "UCX tools not available"
      return 1
    fi

    # Test UCX device detection; only the captured text is evaluated.
    ucx_output=$(ucx_info -d 2>/dev/null) || true
    if [ -z "$ucx_output" ]; then
      print_status "warning" "UCX not detecting devices"
      return 1
    fi

    print_status "success" "UCX detecting devices"

    # Show UCX device info
    printf '%s\n' "$ucx_output" | head -10

    # Check for RDMA transports. Match only on "Transport:" lines and require
    # the name to be rc/ud/dc itself or to start with "rc_", "ud_" or "dc_".
    # A bare substring search would also hit unrelated text such as "cuda_copy".
    if printf '%s\n' "$ucx_output" \
      | grep -Eq 'Transport:[[:space:]]*(rc|ud|dc)(_|[[:space:]]|$)'; then
      print_status "success" "UCX RDMA transports available"
    else
      print_status "warning" "UCX RDMA transports not detected"
    fi
    return 0
  }
  # Function to test RDMA CM (Connection Manager)
  #
  # Checks that the /dev/infiniband/rdma_cm node exists.
  #
  # Outputs: status lines via print_status
  # Returns: 0 if the node exists, 1 otherwise
  test_rdma_cm() {
    print_status "info" "Testing RDMA Connection Manager..."

    if [ ! -e /dev/infiniband/rdma_cm ]; then
      print_status "warning" "RDMA CM device not found"
      return 1
    fi

    print_status "success" "RDMA CM device found"
    return 0
  }
  # Function to test basic RDMA operations
  #
  # Only checks which RDMA test utilities are installed; nothing is executed.
  #
  # Outputs: status lines via print_status
  test_rdma_operations() {
    local candidate
    local -a extras=()

    print_status "info" "Testing basic RDMA operations..."

    # ibv_rc_pingpong would need a client/server pair to actually run, so the
    # check is limited to the binary being present.
    if ! command -v ibv_rc_pingpong >/dev/null 2>&1; then
      print_status "warning" "RDMA test tools not available"
    else
      print_status "success" "RDMA test tools available (ibv_rc_pingpong)"
    fi

    # Collect whichever of the other utilities can be found.
    for candidate in ibv_asyncwatch ibv_read_lat ibv_write_lat; do
      if command -v "$candidate" >/dev/null 2>&1; then
        extras+=("$candidate")
      fi
    done

    if [ "${#extras[@]}" -eq 0 ]; then
      print_status "warning" "No additional RDMA test tools found"
    else
      print_status "success" "Found ${#extras[@]} additional RDMA test tools"
    fi
  }
  # Function to generate test summary
  #
  # Re-probes devices, libibverbs and UCX, prints one PASS/FAIL line for each
  # and a closing verdict with next steps or troubleshooting hints.
  #
  # Outputs: summary text on stdout
  # Returns: 0 when both the device and libibverbs probes pass, 1 otherwise
  #          (the UCX result is reported but does not affect the verdict)
  generate_summary() {
    local devices_ok=0
    local libibverbs_ok=0
    local ucx_ok=0
    local entry

    echo ""
    print_status "info" "RDMA Simulation Test Summary"
    echo "======================================"

    # Re-run key tests for summary.
    # A device is any directory entry below the sysfs class directory. An
    # unmatched glob stays literal and fails the -d test.
    for entry in /sys/class/infiniband/*; do
      if [ -d "$entry" ]; then
        devices_ok=1
        break
      fi
    done

    if command -v ibv_devinfo >/dev/null 2>&1 && ibv_devinfo >/dev/null 2>&1; then
      libibverbs_ok=1
    fi

    if command -v ucx_info >/dev/null 2>&1 && ucx_info -d >/dev/null 2>&1; then
      ucx_ok=1
    fi

    echo "📊 Test Results:"

    # Explicit if/else: with "A && B || C" the FAIL line would also be printed
    # whenever printing the PASS line returned non-zero.
    if [ "$devices_ok" -eq 1 ]; then
      print_status "success" "RDMA Devices: PASS"
    else
      print_status "error" "RDMA Devices: FAIL"
    fi

    if [ "$libibverbs_ok" -eq 1 ]; then
      print_status "success" "libibverbs: PASS"
    else
      print_status "error" "libibverbs: FAIL"
    fi

    if [ "$ucx_ok" -eq 1 ]; then
      print_status "success" "UCX: PASS"
    else
      print_status "warning" "UCX: FAIL/WARNING"
    fi

    echo ""

    if [ "$devices_ok" -eq 1 ] && [ "$libibverbs_ok" -eq 1 ]; then
      print_status "success" "RDMA simulation environment is ready! 🎉"
      echo ""
      print_status "info" "You can now:"
      echo " - Run RDMA applications"
      echo " - Test SeaweedFS RDMA engine with real RDMA"
      echo " - Use UCX for high-performance transfers"
      return 0
    fi

    print_status "error" "RDMA simulation setup needs attention"
    echo ""
    print_status "info" "Troubleshooting:"
    echo " - Run setup script: sudo /opt/rdma-sim/setup-soft-roce.sh"
    echo " - Check container privileges (--privileged flag)"
    echo " - Verify kernel RDMA support"
    return 1
  }
  # Main test execution
  #
  # Prints the suite banner, runs each test stage in order and ends with the
  # summary. A failing stage never stops the run; only the summary decides
  # the return status.
  #
  # Returns: the status of generate_summary
  main() {
    local stage

    printf '%s\n' "🚀 RDMA Simulation Test Suite" \
      "======================================"

    # Run tests, each followed by a blank separator line.
    for stage in \
      test_rdma_devices \
      test_libibverbs \
      test_ucx \
      test_rdma_cm \
      test_rdma_operations; do
      "$stage" || true
      echo ""
    done

    # Generate summary
    generate_summary
  }
  # Health check mode (for Docker healthcheck)
  #
  # With "healthcheck" as the first argument the script only verifies that at
  # least one device directory exists below /sys/class/infiniband and exits
  # 0 (healthy) or 1 (unhealthy) without running the test suite.
  if [ "${1:-}" = "healthcheck" ]; then
    # Quick health check - just verify devices exist. A glob test avoids
    # spawning ls and wc on every probe; an unmatched glob stays literal and
    # fails the -d test.
    for rdma_dev in /sys/class/infiniband/*; do
      if [ -d "$rdma_dev" ]; then
        exit 0
      fi
    done
    exit 1
  fi

  # Execute main function
  main "$@"