// url-validator.js
// GrabZilla 2.1 - URL Validation Utilities
// Comprehensive URL validation for video platforms
  3. class URLValidator {
  4. /**
  5. * Detect punycode/IDN domains that might be used for spoofing
  6. * @param {string} hostname - Domain hostname to check
  7. * @returns {boolean} True if punycode detected
  8. */
  9. static isPunycode(hostname) {
  10. if (!hostname || typeof hostname !== 'string') {
  11. return false;
  12. }
  13. // Punycode domains start with "xn--"
  14. return hostname.toLowerCase().includes('xn--');
  15. }
  16. /**
  17. * Detect potential homograph attacks using lookalike characters
  18. * @param {string} hostname - Domain hostname to check
  19. * @returns {object} Detection result with suspicious characters
  20. */
  21. static detectHomographAttack(hostname) {
  22. if (!hostname || typeof hostname !== 'string') {
  23. return { suspicious: false, characters: [] };
  24. }
  25. // Common homograph character mappings
  26. const suspiciousChars = [
  27. // Cyrillic lookalikes
  28. { char: 'а', lookalike: 'a', name: 'Cyrillic а' },
  29. { char: 'е', lookalike: 'e', name: 'Cyrillic е' },
  30. { char: 'о', lookalike: 'o', name: 'Cyrillic о' },
  31. { char: 'р', lookalike: 'p', name: 'Cyrillic р' },
  32. { char: 'с', lookalike: 'c', name: 'Cyrillic с' },
  33. { char: 'у', lookalike: 'y', name: 'Cyrillic у' },
  34. { char: 'х', lookalike: 'x', name: 'Cyrillic х' },
  35. // Greek lookalikes
  36. { char: 'ο', lookalike: 'o', name: 'Greek omicron' },
  37. { char: 'ν', lookalike: 'v', name: 'Greek nu' },
  38. { char: 'α', lookalike: 'a', name: 'Greek alpha' },
  39. // Other suspicious characters
  40. { char: 'і', lookalike: 'i', name: 'Ukrainian i' },
  41. { char: 'ј', lookalike: 'j', name: 'Cyrillic j' }
  42. ];
  43. const found = [];
  44. const lowerHostname = hostname.toLowerCase();
  45. for (const { char, lookalike, name } of suspiciousChars) {
  46. if (lowerHostname.includes(char)) {
  47. found.push({ char, lookalike, name });
  48. }
  49. }
  50. // Check for mixed scripts (ASCII + non-ASCII)
  51. const hasAscii = /[a-z]/i.test(hostname);
  52. const hasNonAscii = /[^\x00-\x7F]/.test(hostname);
  53. const mixedScripts = hasAscii && hasNonAscii;
  54. return {
  55. suspicious: found.length > 0 || mixedScripts,
  56. characters: found,
  57. mixedScripts
  58. };
  59. }
  60. /**
  61. * Validate URL against security threats (punycode, homographs)
  62. * @param {string} url - URL to validate
  63. * @returns {object} Validation result with security warnings
  64. */
  65. static validateUrlSecurity(url) {
  66. if (!url || typeof url !== 'string') {
  67. return { safe: false, warnings: ['Invalid URL'] };
  68. }
  69. try {
  70. const parsed = new URL(url.startsWith('http') ? url : 'https://' + url);
  71. const hostname = parsed.hostname.toLowerCase();
  72. const warnings = [];
  73. // Check for punycode
  74. if (this.isPunycode(hostname)) {
  75. warnings.push('⚠️ Punycode domain detected - may be used for spoofing');
  76. }
  77. // Check for homograph attacks
  78. const homograph = this.detectHomographAttack(hostname);
  79. if (homograph.suspicious) {
  80. if (homograph.characters.length > 0) {
  81. warnings.push(`⚠️ Suspicious lookalike characters detected: ${homograph.characters.map(c => c.name).join(', ')}`);
  82. }
  83. if (homograph.mixedScripts) {
  84. warnings.push('⚠️ Mixed character scripts detected - potential homograph attack');
  85. }
  86. }
  87. // Verify against trusted domains
  88. const trustedDomains = [
  89. 'youtube.com',
  90. 'youtu.be',
  91. 'vimeo.com'
  92. ];
  93. const isTrusted = trustedDomains.some(domain =>
  94. hostname === domain ||
  95. hostname.endsWith('.' + domain) ||
  96. hostname === 'www.' + domain
  97. );
  98. if (!isTrusted && warnings.length === 0) {
  99. warnings.push('⚠️ Domain not in trusted list');
  100. }
  101. return {
  102. safe: warnings.length === 0,
  103. warnings,
  104. hostname,
  105. isTrusted
  106. };
  107. } catch (error) {
  108. return {
  109. safe: false,
  110. warnings: ['Invalid URL format']
  111. };
  112. }
  113. }
  114. // Check if URL is a valid video URL from supported platforms
  115. static isValidVideoUrl(url) {
  116. if (!url || typeof url !== 'string') {
  117. return false;
  118. }
  119. const trimmedUrl = url.trim();
  120. if (trimmedUrl.length === 0) {
  121. return false;
  122. }
  123. // SECURITY: Check for punycode and homograph attacks
  124. const securityCheck = this.validateUrlSecurity(trimmedUrl);
  125. if (!securityCheck.safe) {
  126. // Log security warnings but still allow if domain is trusted
  127. if (!securityCheck.isTrusted) {
  128. console.warn('URL security warnings:', securityCheck.warnings);
  129. return false;
  130. }
  131. }
  132. // Check against supported platforms
  133. return this.isYouTubeUrl(trimmedUrl) ||
  134. this.isVimeoUrl(trimmedUrl) ||
  135. this.isGenericVideoUrl(trimmedUrl);
  136. }
  137. // Validate YouTube URLs (including Shorts)
  138. static isYouTubeUrl(url) {
  139. // Match YouTube URLs with any query parameters (including Shorts)
  140. const videoPattern = /^(https?:\/\/)?(www\.)?(youtube\.com\/(watch\?v=|embed\/|v\/|shorts\/)|youtu\.be\/)[\w\-_]{11}([?&].*)?$/i;
  141. const playlistPattern = /^(https?:\/\/)?(www\.)?youtube\.com\/playlist\?list=[\w\-]+/i;
  142. return videoPattern.test(url) || playlistPattern.test(url);
  143. }
  144. // Validate Vimeo URLs
  145. static isVimeoUrl(url) {
  146. const patterns = (typeof window !== 'undefined' && window.AppConfig?.VALIDATION_PATTERNS) || {
  147. VIMEO_URL: /^(https?:\/\/)?(www\.)?(vimeo\.com\/\d+|player\.vimeo\.com\/video\/\d+)/i
  148. };
  149. return patterns.VIMEO_URL.test(url);
  150. }
  151. // Check if URL is a YouTube playlist
  152. static isYouTubePlaylist(url) {
  153. if (!url || typeof url !== 'string') {
  154. return false;
  155. }
  156. return /[?&]list=[\w\-]+/.test(url);
  157. }
  158. // Check if URL is a YouTube Shorts video
  159. static isYouTubeShorts(url) {
  160. if (!url || typeof url !== 'string') {
  161. return false;
  162. }
  163. return /youtube\.com\/shorts\/[\w\-_]{11}/i.test(url);
  164. }
  165. // Validate generic video URLs
  166. static isGenericVideoUrl(url) {
  167. // Disable generic video URL validation to be more strict
  168. // Only allow explicitly supported platforms (YouTube, Vimeo)
  169. return false;
  170. }
  171. // Extract video ID from YouTube URL (including Shorts)
  172. static extractYouTubeId(url) {
  173. if (!this.isYouTubeUrl(url)) {
  174. return null;
  175. }
  176. const patterns = [
  177. /[?&]v=([^&#]*)/, // youtube.com/watch?v=ID
  178. /\/embed\/([^\/\?]*)/, // youtube.com/embed/ID
  179. /\/v\/([^\/\?]*)/, // youtube.com/v/ID
  180. /\/shorts\/([^\/\?]*)/, // youtube.com/shorts/ID
  181. /youtu\.be\/([^\/\?]*)/ // youtu.be/ID
  182. ];
  183. for (const pattern of patterns) {
  184. const match = url.match(pattern);
  185. if (match && match[1]) {
  186. return match[1];
  187. }
  188. }
  189. return null;
  190. }
  191. // Extract video ID from Vimeo URL
  192. static extractVimeoId(url) {
  193. if (!this.isVimeoUrl(url)) {
  194. return null;
  195. }
  196. const match = url.match(/vimeo\.com\/(\d+)/);
  197. return match ? match[1] : null;
  198. }
  199. // Normalize URL to standard format
  200. static normalizeUrl(url) {
  201. if (!url || typeof url !== 'string') {
  202. return url;
  203. }
  204. let normalizedUrl = url.trim();
  205. // Add protocol if missing
  206. if (!/^https?:\/\//i.test(normalizedUrl)) {
  207. normalizedUrl = 'https://' + normalizedUrl;
  208. }
  209. // Normalize YouTube URLs
  210. if (this.isYouTubeUrl(normalizedUrl)) {
  211. const videoId = this.extractYouTubeId(normalizedUrl);
  212. if (videoId) {
  213. return `https://www.youtube.com/watch?v=${videoId}`;
  214. }
  215. }
  216. // Normalize Vimeo URLs
  217. if (this.isVimeoUrl(normalizedUrl)) {
  218. const videoId = this.extractVimeoId(normalizedUrl);
  219. if (videoId) {
  220. return `https://vimeo.com/${videoId}`;
  221. }
  222. }
  223. return normalizedUrl;
  224. }
  225. // Get platform name from URL
  226. static getPlatform(url) {
  227. if (this.isYouTubeUrl(url)) {
  228. return 'YouTube';
  229. }
  230. if (this.isVimeoUrl(url)) {
  231. return 'Vimeo';
  232. }
  233. return 'Unknown';
  234. }
  235. // Validate multiple URLs (one per line)
  236. static validateMultipleUrls(urlText) {
  237. if (!urlText || typeof urlText !== 'string') {
  238. return { valid: [], invalid: [] };
  239. }
  240. // Extract all URLs from text using regex patterns
  241. // Match entire YouTube URLs including all query parameters (including Shorts)
  242. const youtubePattern = /(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:watch\?v=|embed\/|v\/|shorts\/)|youtu\.be\/)[\w\-_]{11}(?:[?&][^\s]*)*/gi;
  243. const vimeoPattern = /(?:https?:\/\/)?(?:www\.)?(?:vimeo\.com\/|player\.vimeo\.com\/video\/)\d+/gi;
  244. const youtubeMatches = urlText.match(youtubePattern) || [];
  245. const vimeoMatches = urlText.match(vimeoPattern) || [];
  246. const allUrls = [...youtubeMatches, ...vimeoMatches];
  247. const valid = [];
  248. const invalid = [];
  249. const seen = new Set();
  250. allUrls.forEach(url => {
  251. // Fully normalize URLs to canonical format for deduplication
  252. const normalizedUrl = this.normalizeUrl(url);
  253. // Deduplicate based on normalized canonical URL
  254. if (!seen.has(normalizedUrl)) {
  255. seen.add(normalizedUrl);
  256. if (this.isValidVideoUrl(normalizedUrl)) {
  257. valid.push(normalizedUrl);
  258. } else {
  259. invalid.push(url);
  260. }
  261. }
  262. });
  263. return { valid, invalid };
  264. }
  265. // Check for duplicate URLs in a list
  266. static findDuplicates(urls) {
  267. const normalized = urls.map(url => this.normalizeUrl(url));
  268. const duplicates = [];
  269. const seen = new Set();
  270. normalized.forEach((url, index) => {
  271. if (seen.has(url)) {
  272. duplicates.push({ url: urls[index], index });
  273. } else {
  274. seen.add(url);
  275. }
  276. });
  277. return duplicates;
  278. }
  279. // Get validation error message
  280. static getValidationError(url) {
  281. if (url === null || url === undefined) {
  282. return 'URL is required';
  283. }
  284. if (typeof url !== 'string' || url.trim().length === 0) {
  285. return 'URL cannot be empty';
  286. }
  287. const trimmedUrl = url.trim();
  288. if (!/^https?:\/\//i.test(trimmedUrl) && !/^www\./i.test(trimmedUrl) && !trimmedUrl.includes('.')) {
  289. return 'Invalid URL format - must include domain';
  290. }
  291. if (!this.isValidVideoUrl(trimmedUrl)) {
  292. return 'Unsupported video platform - currently supports YouTube and Vimeo';
  293. }
  294. return null; // Valid URL
  295. }
  296. }
  297. // Export for use in other modules
  298. if (typeof module !== 'undefined' && module.exports) {
  299. // Node.js environment
  300. module.exports = URLValidator;
  301. } else {
  302. // Browser environment - attach to window
  303. window.URLValidator = URLValidator;
  304. }