From 52be0cadf7a67cf1dbe8f462451ea4c289f52034 Mon Sep 17 00:00:00 2001 From: gorhill Date: Fri, 18 Oct 2013 17:35:35 -0300 Subject: [PATCH 1/6] using binary search instead of regex --- src/SecondLevelDomains.js | 61 +++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 18 deletions(-) diff --git a/src/SecondLevelDomains.js b/src/SecondLevelDomains.js index 1e28c311..fed81fd2 100644 --- a/src/SecondLevelDomains.js +++ b/src/SecondLevelDomains.js @@ -175,21 +175,49 @@ var SLD = { "za":"ac|agric|alt|bourse|city|co|cybernet|db|edu|gov|grondar|iaccess|imt|inca|landesign|law|mil|net|ngo|nis|nom|olivetti|org|pix|school|tm|web", "zm":"ac|co|com|edu|gov|net|org|sch" }, - // SLD expression for each TLD - //expressions: {}, - // SLD expression for all TLDs - has_expression: null, - is_expression: null, - // validate domain is a known SLD + // http://jsperf.com/long-string-indexof-vs-quickindexof/2 + quickIndexOf: function(s, t) { + var i, j, k; + var left = 1; + var right = s.length - 1; + var sub; + t = ' ' + t + ' '; + while (left < right) { + i = left + right >> 1; + j = s.lastIndexOf(' ', i); + k = s.indexOf(' ', j+1) + 1; + sub = s.slice(j, k); + if ( t < sub ) { + right = j; + } else if ( t > sub ) { + left = k; + } else { + return j; + } + } + return -1; + }, has: function(domain) { - return !!domain.match(SLD.has_expression); + var dd = domain.split('.'); + if ( dd.length < 3 ) { return false; } + return SLD.quickIndexOf(SLD.all, dd.reverse().slice(0,2).join('.')) >= 0; }, is: function(domain) { - return !!domain.match(SLD.is_expression); + var dd = domain.split('.'); + if ( dd.length > 2 ) { return false; } + return SLD.quickIndexOf(SLD.all, dd.reverse().join('.')) >= 0; }, get: function(domain) { - var t = domain.match(SLD.has_expression); - return t && t[1] || null; + var dd = domain.split('.'); + if ( dd.length < 3 ) { + return null; + } + var i = SLD.quickIndexOf(SLD.all, dd.reverse().slice(0,2).join('.')); + if ( i < 0 ) { + return null; + } + var j = SLD.all.indexOf(' ', i+1); + return SLD.all.slice(i+1,j).split('.').reverse().join('.'); }, noConflict: function(){ if (root.SecondLevelDomains === this) { @@ -198,19 +226,16 @@ var SLD = { return this; }, init: function() { - var t = ''; + var t = []; for (var tld in SLD.list) { if (!hasOwn.call(SLD.list, tld)) { continue; } - - var expression = '(' + SLD.list[tld] + ')\.' + tld; - //SLD.expressions[tld] = new RegExp('\.' + expression + '$', 'i'); - t += '|(' + expression + ')'; + t = t.concat(SLD.list[tld].split('|').map(function(sld){ + return tld + '.' + sld; + })); } - - SLD.has_expression = new RegExp('\\.(' + t.substr(1) + ')$', 'i'); - SLD.is_expression = new RegExp('^(' + t.substr(1) + ')$', 'i'); + SLD.all = ' ' + t.sort().join(' ') + ' '; } }; From d3c087942b9003671f86096d92e0919cce5d2874 Mon Sep 17 00:00:00 2001 From: gorhill Date: Wed, 23 Oct 2013 11:46:11 -0200 Subject: [PATCH 2/6] using binary search to find valid SLD --- src/SecondLevelDomains.js | 34 +++++++++++++++++++--------------- test/test.js | 25 +++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/src/SecondLevelDomains.js b/src/SecondLevelDomains.js index fed81fd2..cb4a3d34 100644 --- a/src/SecondLevelDomains.js +++ b/src/SecondLevelDomains.js @@ -175,24 +175,28 @@ var SLD = { "za":"ac|agric|alt|bourse|city|co|cybernet|db|edu|gov|grondar|iaccess|imt|inca|landesign|law|mil|net|ngo|nis|nom|olivetti|org|pix|school|tm|web", "zm":"ac|co|com|edu|gov|net|org|sch" }, - // http://jsperf.com/long-string-indexof-vs-quickindexof/2 - quickIndexOf: function(s, t) { - var i, j, k; + // http://jsperf.com/uri-js-sld-regex-vs-binary-search + quickIndexOf: function(haystack, needle) { + var midpoint, start, end; + var straw; var left = 1; - var right = s.length - 1; - var sub; - t = ' ' + t + ' '; + var right = haystack.length - 1; + needle = ' ' + needle + ' '; while (left < right) { - i = left + right >> 1; - j = s.lastIndexOf(' ', i); - k = s.indexOf(' ', j+1) + 1; - sub = s.slice(j, k); - if ( t < sub ) { - right = j; - } else if ( t > sub ) { - left = k; + // find midpoint: bitwise shift right allows us to divide by 2 + // and obtain an integer as a result without using Math.floor() + midpoint = left + right >> 1; + // there is a straw at midpoint, find its start and end in order + // to extract it whole + start = haystack.lastIndexOf(' ', midpoint); + end = haystack.indexOf(' ', start+1) + 1; + straw = haystack.slice(start, end); + if ( needle < straw ) { + right = start; + } else if ( needle > straw ) { + left = end; } else { - return j; + return start; // Oh, that's not a straw, that's our needle! } } return -1; diff --git a/test/test.js b/test/test.js index b9d9c3b7..28f86d69 100644 --- a/test/test.js +++ b/test/test.js @@ -463,6 +463,31 @@ test("tld", function() { equal(u.tld(), "se", "se tld"); }); +test("sld", function() { + // Lets just test them all.. + // Calling URI.is(), URI.domain(), URI.subdomain() allows us to indirectly + // test SLD.has(), SLD.is() and SLD.get() + var u = new URI("http://www.example.org/foo.html"); + equal(u.is("sld"), false, "is not sld"); + var list = SecondLevelDomains.list; + var tlds = Object.keys(SecondLevelDomains.list); + var iTld = tlds.length; + var tld, slds, sld, iSld; + while ( iTld-- ) { + tld = tlds[iTld]; + slds = list[tld].split("|"); + iSld = slds.length; + while ( iSld-- ) { + sld = slds[iSld] + '.' + tld; + u.hostname("www.example." + sld); + equal(u.is("sld"), true, "is sld"); + equal(u.domain(), "example." + sld, "domain is example." + sld); + equal(u.subdomain(), "www", "subdomain is www"); + u.hostname('www.example.' + tld); + equal(u.is("sld"), false, "is not sld"); + } + } +}); test("directory", function() { var u = new URI("http://www.example.org/some/directory/foo.html"); u.directory("/"); From 098744c30a95ee7830c8dc9dcd78361d5efa7e51 Mon Sep 17 00:00:00 2001 From: gorhill Date: Wed, 23 Oct 2013 11:52:19 -0200 Subject: [PATCH 3/6] use local var --- test/test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.js b/test/test.js index 28f86d69..782b0679 100644 --- a/test/test.js +++ b/test/test.js @@ -470,7 +470,7 @@ test("sld", function() { var u = new URI("http://www.example.org/foo.html"); equal(u.is("sld"), false, "is not sld"); var list = SecondLevelDomains.list; - var tlds = Object.keys(SecondLevelDomains.list); + var tlds = Object.keys(list); var iTld = tlds.length; var tld, slds, sld, iSld; while ( iTld-- ) { From 2f36eac407fddbef165fec48ea3f913f5ae5a951 Mon Sep 17 00:00:00 2001 From: gorhill Date: Fri, 25 Oct 2013 15:34:41 -0200 Subject: [PATCH 4/6] performance improvement of SLD.had, SLD.is and SLD.get --- src/SecondLevelDomains.js | 344 +++++++++++++++++--------------------- test/test.js | 7 +- 2 files changed, 160 insertions(+), 191 deletions(-) diff --git a/src/SecondLevelDomains.js b/src/SecondLevelDomains.js index cb4a3d34..7454563b 100644 --- a/src/SecondLevelDomains.js +++ b/src/SecondLevelDomains.js @@ -41,209 +41,175 @@ var SLD = { // issues browser have to deal with (SOP for cookies, etc) - but is way overboard for URI.js // ---- list: { - "ac":"com|gov|mil|net|org", - "ae":"ac|co|gov|mil|name|net|org|pro|sch", - "af":"com|edu|gov|net|org", - "al":"com|edu|gov|mil|net|org", - "ao":"co|ed|gv|it|og|pb", - "ar":"com|edu|gob|gov|int|mil|net|org|tur", - "at":"ac|co|gv|or", - "au":"asn|com|csiro|edu|gov|id|net|org", - "ba":"co|com|edu|gov|mil|net|org|rs|unbi|unmo|unsa|untz|unze", - "bb":"biz|co|com|edu|gov|info|net|org|store|tv", - "bh":"biz|cc|com|edu|gov|info|net|org", - "bn":"com|edu|gov|net|org", - "bo":"com|edu|gob|gov|int|mil|net|org|tv", - "br":"adm|adv|agr|am|arq|art|ato|b|bio|blog|bmd|cim|cng|cnt|com|coop|ecn|edu|eng|esp|etc|eti|far|flog|fm|fnd|fot|fst|g12|ggf|gov|imb|ind|inf|jor|jus|lel|mat|med|mil|mus|net|nom|not|ntr|odo|org|ppg|pro|psc|psi|qsl|rec|slg|srv|tmp|trd|tur|tv|vet|vlog|wiki|zlg", - "bs":"com|edu|gov|net|org", - "bz":"du|et|om|ov|rg", - "ca":"ab|bc|mb|nb|nf|nl|ns|nt|nu|on|pe|qc|sk|yk", - "ck":"biz|co|edu|gen|gov|info|net|org", - "cn":"ac|ah|bj|com|cq|edu|fj|gd|gov|gs|gx|gz|ha|hb|he|hi|hl|hn|jl|js|jx|ln|mil|net|nm|nx|org|qh|sc|sd|sh|sn|sx|tj|tw|xj|xz|yn|zj", - "co":"com|edu|gov|mil|net|nom|org", - "cr":"ac|c|co|ed|fi|go|or|sa", - "cy":"ac|biz|com|ekloges|gov|ltd|name|net|org|parliament|press|pro|tm", - "do":"art|com|edu|gob|gov|mil|net|org|sld|web", - "dz":"art|asso|com|edu|gov|net|org|pol", - "ec":"com|edu|fin|gov|info|med|mil|net|org|pro", - "eg":"com|edu|eun|gov|mil|name|net|org|sci", - "er":"com|edu|gov|ind|mil|net|org|rochest|w", - "es":"com|edu|gob|nom|org", - "et":"biz|com|edu|gov|info|name|net|org", - "fj":"ac|biz|com|info|mil|name|net|org|pro", - "fk":"ac|co|gov|net|nom|org", - "fr":"asso|com|f|gouv|nom|prd|presse|tm", - "gg":"co|net|org", - "gh":"com|edu|gov|mil|org", - "gn":"ac|com|gov|net|org", - "gr":"com|edu|gov|mil|net|org", - "gt":"com|edu|gob|ind|mil|net|org", - "gu":"com|edu|gov|net|org", - "hk":"com|edu|gov|idv|net|org", - "id":"ac|co|go|mil|net|or|sch|web", - "il":"ac|co|gov|idf|k12|muni|net|org", - "in":"ac|co|edu|ernet|firm|gen|gov|i|ind|mil|net|nic|org|res", - "iq":"com|edu|gov|i|mil|net|org", - "ir":"ac|co|dnssec|gov|i|id|net|org|sch", - "it":"edu|gov", - "je":"co|net|org", - "jo":"com|edu|gov|mil|name|net|org|sch", - "jp":"ac|ad|co|ed|go|gr|lg|ne|or", - "ke":"ac|co|go|info|me|mobi|ne|or|sc", - "kh":"com|edu|gov|mil|net|org|per", - "ki":"biz|com|de|edu|gov|info|mob|net|org|tel", - "km":"asso|com|coop|edu|gouv|k|medecin|mil|nom|notaires|pharmaciens|presse|tm|veterinaire", - "kn":"edu|gov|net|org", - "kr":"ac|busan|chungbuk|chungnam|co|daegu|daejeon|es|gangwon|go|gwangju|gyeongbuk|gyeonggi|gyeongnam|hs|incheon|jeju|jeonbuk|jeonnam|k|kg|mil|ms|ne|or|pe|re|sc|seoul|ulsan", - "kw":"com|edu|gov|net|org", - "ky":"com|edu|gov|net|org", - "kz":"com|edu|gov|mil|net|org", - "lb":"com|edu|gov|net|org", - "lk":"assn|com|edu|gov|grp|hotel|int|ltd|net|ngo|org|sch|soc|web", - "lr":"com|edu|gov|net|org", - "lv":"asn|com|conf|edu|gov|id|mil|net|org", - "ly":"com|edu|gov|id|med|net|org|plc|sch", - "ma":"ac|co|gov|m|net|org|press", - "mc":"asso|tm", - "me":"ac|co|edu|gov|its|net|org|priv", - "mg":"com|edu|gov|mil|nom|org|prd|tm", - "mk":"com|edu|gov|inf|name|net|org|pro", - "ml":"com|edu|gov|net|org|presse", - "mn":"edu|gov|org", - "mo":"com|edu|gov|net|org", - "mt":"com|edu|gov|net|org", - "mv":"aero|biz|com|coop|edu|gov|info|int|mil|museum|name|net|org|pro", - "mw":"ac|co|com|coop|edu|gov|int|museum|net|org", - "mx":"com|edu|gob|net|org", - "my":"com|edu|gov|mil|name|net|org|sch", - "nf":"arts|com|firm|info|net|other|per|rec|store|web", - "ng":"biz|com|edu|gov|mil|mobi|name|net|org|sch", - "ni":"ac|co|com|edu|gob|mil|net|nom|org", - "np":"com|edu|gov|mil|net|org", - "nr":"biz|com|edu|gov|info|net|org", - "om":"ac|biz|co|com|edu|gov|med|mil|museum|net|org|pro|sch", - "pe":"com|edu|gob|mil|net|nom|org|sld", - "ph":"com|edu|gov|i|mil|net|ngo|org", - "pk":"biz|com|edu|fam|gob|gok|gon|gop|gos|gov|net|org|web", - "pl":"art|bialystok|biz|com|edu|gda|gdansk|gorzow|gov|info|katowice|krakow|lodz|lublin|mil|net|ngo|olsztyn|org|poznan|pwr|radom|slupsk|szczecin|torun|warszawa|waw|wroc|wroclaw|zgora", - "pr":"ac|biz|com|edu|est|gov|info|isla|name|net|org|pro|prof", - "ps":"com|edu|gov|net|org|plo|sec", - "pw":"belau|co|ed|go|ne|or", - "ro":"arts|com|firm|info|nom|nt|org|rec|store|tm|www", - "rs":"ac|co|edu|gov|in|org", - "sb":"com|edu|gov|net|org", - "sc":"com|edu|gov|net|org", - "sh":"co|com|edu|gov|net|nom|org", - "sl":"com|edu|gov|net|org", - "st":"co|com|consulado|edu|embaixada|gov|mil|net|org|principe|saotome|store", - "sv":"com|edu|gob|org|red", - "sz":"ac|co|org", - "tr":"av|bbs|bel|biz|com|dr|edu|gen|gov|info|k12|name|net|org|pol|tel|tsk|tv|web", - "tt":"aero|biz|cat|co|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel", - "tw":"club|com|ebiz|edu|game|gov|idv|mil|net|org", - "mu":"ac|co|com|gov|net|or|org", - "mz":"ac|co|edu|gov|org", - "na":"co|com", - "nz":"ac|co|cri|geek|gen|govt|health|iwi|maori|mil|net|org|parliament|school", - "pa":"abo|ac|com|edu|gob|ing|med|net|nom|org|sld", - "pt":"com|edu|gov|int|net|nome|org|publ", - "py":"com|edu|gov|mil|net|org", - "qa":"com|edu|gov|mil|net|org", - "re":"asso|com|nom", - "ru":"ac|adygeya|altai|amur|arkhangelsk|astrakhan|bashkiria|belgorod|bir|bryansk|buryatia|cbg|chel|chelyabinsk|chita|chukotka|chuvashia|com|dagestan|e-burg|edu|gov|grozny|int|irkutsk|ivanovo|izhevsk|jar|joshkar-ola|kalmykia|kaluga|kamchatka|karelia|kazan|kchr|kemerovo|khabarovsk|khakassia|khv|kirov|koenig|komi|kostroma|kranoyarsk|kuban|kurgan|kursk|lipetsk|magadan|mari|mari-el|marine|mil|mordovia|mosreg|msk|murmansk|nalchik|net|nnov|nov|novosibirsk|nsk|omsk|orenburg|org|oryol|penza|perm|pp|pskov|ptz|rnd|ryazan|sakhalin|samara|saratov|simbirsk|smolensk|spb|stavropol|stv|surgut|tambov|tatarstan|tom|tomsk|tsaritsyn|tsk|tula|tuva|tver|tyumen|udm|udmurtia|ulan-ude|vladikavkaz|vladimir|vladivostok|volgograd|vologda|voronezh|vrn|vyatka|yakutia|yamal|yekaterinburg|yuzhno-sakhalinsk", - "rw":"ac|co|com|edu|gouv|gov|int|mil|net", - "sa":"com|edu|gov|med|net|org|pub|sch", - "sd":"com|edu|gov|info|med|net|org|tv", - "se":"a|ac|b|bd|c|d|e|f|g|h|i|k|l|m|n|o|org|p|parti|pp|press|r|s|t|tm|u|w|x|y|z", - "sg":"com|edu|gov|idn|net|org|per", - "sn":"art|com|edu|gouv|org|perso|univ", - "sy":"com|edu|gov|mil|net|news|org", - "th":"ac|co|go|in|mi|net|or", - "tj":"ac|biz|co|com|edu|go|gov|info|int|mil|name|net|nic|org|test|web", - "tn":"agrinet|com|defense|edunet|ens|fin|gov|ind|info|intl|mincom|nat|net|org|perso|rnrt|rns|rnu|tourism", - "tz":"ac|co|go|ne|or", - "ua":"biz|cherkassy|chernigov|chernovtsy|ck|cn|co|com|crimea|cv|dn|dnepropetrovsk|donetsk|dp|edu|gov|if|in|ivano-frankivsk|kh|kharkov|kherson|khmelnitskiy|kiev|kirovograd|km|kr|ks|kv|lg|lugansk|lutsk|lviv|me|mk|net|nikolaev|od|odessa|org|pl|poltava|pp|rovno|rv|sebastopol|sumy|te|ternopil|uzhgorod|vinnica|vn|zaporizhzhe|zhitomir|zp|zt", - "ug":"ac|co|go|ne|or|org|sc", - "uk":"ac|bl|british-library|co|cym|gov|govt|icnet|jet|lea|ltd|me|mil|mod|national-library-scotland|nel|net|nhs|nic|nls|org|orgn|parliament|plc|police|sch|scot|soc", - "us":"dni|fed|isa|kids|nsn", - "uy":"com|edu|gub|mil|net|org", - "ve":"co|com|edu|gob|info|mil|net|org|web", - "vi":"co|com|k12|net|org", - "vn":"ac|biz|com|edu|gov|health|info|int|name|net|org|pro", - "ye":"co|com|gov|ltd|me|net|org|plc", - "yu":"ac|co|edu|gov|org", - "za":"ac|agric|alt|bourse|city|co|cybernet|db|edu|gov|grondar|iaccess|imt|inca|landesign|law|mil|net|ngo|nis|nom|olivetti|org|pix|school|tm|web", - "zm":"ac|co|com|edu|gov|net|org|sch" - }, - // http://jsperf.com/uri-js-sld-regex-vs-binary-search - quickIndexOf: function(haystack, needle) { - var midpoint, start, end; - var straw; - var left = 1; - var right = haystack.length - 1; - needle = ' ' + needle + ' '; - while (left < right) { - // find midpoint: bitwise shift right allows us to divide by 2 - // and obtain an integer as a result without using Math.floor() - midpoint = left + right >> 1; - // there is a straw at midpoint, find its start and end in order - // to extract it whole - start = haystack.lastIndexOf(' ', midpoint); - end = haystack.indexOf(' ', start+1) + 1; - straw = haystack.slice(start, end); - if ( needle < straw ) { - right = start; - } else if ( needle > straw ) { - left = end; - } else { - return start; // Oh, that's not a straw, that's our needle! - } - } - return -1; + "ac":" com gov mil net org ", + "ae":" ac co gov mil name net org pro sch ", + "af":" com edu gov net org ", + "al":" com edu gov mil net org ", + "ao":" co ed gv it og pb ", + "ar":" com edu gob gov int mil net org tur ", + "at":" ac co gv or ", + "au":" asn com csiro edu gov id net org ", + "ba":" co com edu gov mil net org rs unbi unmo unsa untz unze ", + "bb":" biz co com edu gov info net org store tv ", + "bh":" biz cc com edu gov info net org ", + "bn":" com edu gov net org ", + "bo":" com edu gob gov int mil net org tv ", + "br":" adm adv agr am arq art ato b bio blog bmd cim cng cnt com coop ecn edu eng esp etc eti far flog fm fnd fot fst g12 ggf gov imb ind inf jor jus lel mat med mil mus net nom not ntr odo org ppg pro psc psi qsl rec slg srv tmp trd tur tv vet vlog wiki zlg ", + "bs":" com edu gov net org ", + "bz":" du et om ov rg ", + "ca":" ab bc mb nb nf nl ns nt nu on pe qc sk yk ", + "ck":" biz co edu gen gov info net org ", + "cn":" ac ah bj com cq edu fj gd gov gs gx gz ha hb he hi hl hn jl js jx ln mil net nm nx org qh sc sd sh sn sx tj tw xj xz yn zj ", + "co":" com edu gov mil net nom org ", + "cr":" ac c co ed fi go or sa ", + "cy":" ac biz com ekloges gov ltd name net org parliament press pro tm ", + "do":" art com edu gob gov mil net org sld web ", + "dz":" art asso com edu gov net org pol ", + "ec":" com edu fin gov info med mil net org pro ", + "eg":" com edu eun gov mil name net org sci ", + "er":" com edu gov ind mil net org rochest w ", + "es":" com edu gob nom org ", + "et":" biz com edu gov info name net org ", + "fj":" ac biz com info mil name net org pro ", + "fk":" ac co gov net nom org ", + "fr":" asso com f gouv nom prd presse tm ", + "gg":" co net org ", + "gh":" com edu gov mil org ", + "gn":" ac com gov net org ", + "gr":" com edu gov mil net org ", + "gt":" com edu gob ind mil net org ", + "gu":" com edu gov net org ", + "hk":" com edu gov idv net org ", + "id":" ac co go mil net or sch web ", + "il":" ac co gov idf k12 muni net org ", + "in":" ac co edu ernet firm gen gov i ind mil net nic org res ", + "iq":" com edu gov i mil net org ", + "ir":" ac co dnssec gov i id net org sch ", + "it":" edu gov ", + "je":" co net org ", + "jo":" com edu gov mil name net org sch ", + "jp":" ac ad co ed go gr lg ne or ", + "ke":" ac co go info me mobi ne or sc ", + "kh":" com edu gov mil net org per ", + "ki":" biz com de edu gov info mob net org tel ", + "km":" asso com coop edu gouv k medecin mil nom notaires pharmaciens presse tm veterinaire ", + "kn":" edu gov net org ", + "kr":" ac busan chungbuk chungnam co daegu daejeon es gangwon go gwangju gyeongbuk gyeonggi gyeongnam hs incheon jeju jeonbuk jeonnam k kg mil ms ne or pe re sc seoul ulsan ", + "kw":" com edu gov net org ", + "ky":" com edu gov net org ", + "kz":" com edu gov mil net org ", + "lb":" com edu gov net org ", + "lk":" assn com edu gov grp hotel int ltd net ngo org sch soc web ", + "lr":" com edu gov net org ", + "lv":" asn com conf edu gov id mil net org ", + "ly":" com edu gov id med net org plc sch ", + "ma":" ac co gov m net org press ", + "mc":" asso tm ", + "me":" ac co edu gov its net org priv ", + "mg":" com edu gov mil nom org prd tm ", + "mk":" com edu gov inf name net org pro ", + "ml":" com edu gov net org presse ", + "mn":" edu gov org ", + "mo":" com edu gov net org ", + "mt":" com edu gov net org ", + "mv":" aero biz com coop edu gov info int mil museum name net org pro ", + "mw":" ac co com coop edu gov int museum net org ", + "mx":" com edu gob net org ", + "my":" com edu gov mil name net org sch ", + "nf":" arts com firm info net other per rec store web ", + "ng":" biz com edu gov mil mobi name net org sch ", + "ni":" ac co com edu gob mil net nom org ", + "np":" com edu gov mil net org ", + "nr":" biz com edu gov info net org ", + "om":" ac biz co com edu gov med mil museum net org pro sch ", + "pe":" com edu gob mil net nom org sld ", + "ph":" com edu gov i mil net ngo org ", + "pk":" biz com edu fam gob gok gon gop gos gov net org web ", + "pl":" art bialystok biz com edu gda gdansk gorzow gov info katowice krakow lodz lublin mil net ngo olsztyn org poznan pwr radom slupsk szczecin torun warszawa waw wroc wroclaw zgora ", + "pr":" ac biz com edu est gov info isla name net org pro prof ", + "ps":" com edu gov net org plo sec ", + "pw":" belau co ed go ne or ", + "ro":" arts com firm info nom nt org rec store tm www ", + "rs":" ac co edu gov in org ", + "sb":" com edu gov net org ", + "sc":" com edu gov net org ", + "sh":" co com edu gov net nom org ", + "sl":" com edu gov net org ", + "st":" co com consulado edu embaixada gov mil net org principe saotome store ", + "sv":" com edu gob org red ", + "sz":" ac co org ", + "tr":" av bbs bel biz com dr edu gen gov info k12 name net org pol tel tsk tv web ", + "tt":" aero biz cat co com coop edu gov info int jobs mil mobi museum name net org pro tel travel ", + "tw":" club com ebiz edu game gov idv mil net org ", + "mu":" ac co com gov net or org ", + "mz":" ac co edu gov org ", + "na":" co com ", + "nz":" ac co cri geek gen govt health iwi maori mil net org parliament school ", + "pa":" abo ac com edu gob ing med net nom org sld ", + "pt":" com edu gov int net nome org publ ", + "py":" com edu gov mil net org ", + "qa":" com edu gov mil net org ", + "re":" asso com nom ", + "ru":" ac adygeya altai amur arkhangelsk astrakhan bashkiria belgorod bir bryansk buryatia cbg chel chelyabinsk chita chukotka chuvashia com dagestan e-burg edu gov grozny int irkutsk ivanovo izhevsk jar joshkar-ola kalmykia kaluga kamchatka karelia kazan kchr kemerovo khabarovsk khakassia khv kirov koenig komi kostroma kranoyarsk kuban kurgan kursk lipetsk magadan mari mari-el marine mil mordovia mosreg msk murmansk nalchik net nnov nov novosibirsk nsk omsk orenburg org oryol penza perm pp pskov ptz rnd ryazan sakhalin samara saratov simbirsk smolensk spb stavropol stv surgut tambov tatarstan tom tomsk tsaritsyn tsk tula tuva tver tyumen udm udmurtia ulan-ude vladikavkaz vladimir vladivostok volgograd vologda voronezh vrn vyatka yakutia yamal yekaterinburg yuzhno-sakhalinsk ", + "rw":" ac co com edu gouv gov int mil net ", + "sa":" com edu gov med net org pub sch ", + "sd":" com edu gov info med net org tv ", + "se":" a ac b bd c d e f g h i k l m n o org p parti pp press r s t tm u w x y z ", + "sg":" com edu gov idn net org per ", + "sn":" art com edu gouv org perso univ ", + "sy":" com edu gov mil net news org ", + "th":" ac co go in mi net or ", + "tj":" ac biz co com edu go gov info int mil name net nic org test web ", + "tn":" agrinet com defense edunet ens fin gov ind info intl mincom nat net org perso rnrt rns rnu tourism ", + "tz":" ac co go ne or ", + "ua":" biz cherkassy chernigov chernovtsy ck cn co com crimea cv dn dnepropetrovsk donetsk dp edu gov if in ivano-frankivsk kh kharkov kherson khmelnitskiy kiev kirovograd km kr ks kv lg lugansk lutsk lviv me mk net nikolaev od odessa org pl poltava pp rovno rv sebastopol sumy te ternopil uzhgorod vinnica vn zaporizhzhe zhitomir zp zt ", + "ug":" ac co go ne or org sc ", + "uk":" ac bl british-library co cym gov govt icnet jet lea ltd me mil mod national-library-scotland nel net nhs nic nls org orgn parliament plc police sch scot soc ", + "us":" dni fed isa kids nsn ", + "uy":" com edu gub mil net org ", + "ve":" co com edu gob info mil net org web ", + "vi":" co com k12 net org ", + "vn":" ac biz com edu gov health info int name net org pro ", + "ye":" co com gov ltd me net org plc ", + "yu":" ac co edu gov org ", + "za":" ac agric alt bourse city co cybernet db edu gov grondar iaccess imt inca landesign law mil net ngo nis nom olivetti org pix school tm web ", + "zm":" ac co com edu gov net org sch " }, has: function(domain) { - var dd = domain.split('.'); - if ( dd.length < 3 ) { return false; } - return SLD.quickIndexOf(SLD.all, dd.reverse().slice(0,2).join('.')) >= 0; + var tldOffset = domain.lastIndexOf('.'); + if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return false; } + var sldOffset = domain.lastIndexOf('.', tldOffset-1); + if ( sldOffset <= 0 || sldOffset >= (tldOffset-1) ) { return false; } + var sldList = SLD.list[domain.slice(tldOffset+1)]; + if ( !sldList ) { return false; } + return sldList.indexOf(' ' + domain.slice(sldOffset+1, tldOffset) + ' ') >= 0; }, is: function(domain) { - var dd = domain.split('.'); - if ( dd.length > 2 ) { return false; } - return SLD.quickIndexOf(SLD.all, dd.reverse().join('.')) >= 0; + var tldOffset = domain.lastIndexOf('.'); + if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return false; } + var sldOffset = domain.lastIndexOf('.', tldOffset-1); + if ( sldOffset >= 0 ) { return false; } + var sldList = SLD.list[domain.slice(tldOffset+1)]; + if ( !sldList ) { return false; } + return sldList.indexOf(' ' + domain.slice(0, tldOffset) + ' ') >= 0; }, get: function(domain) { - var dd = domain.split('.'); - if ( dd.length < 3 ) { - return null; - } - var i = SLD.quickIndexOf(SLD.all, dd.reverse().slice(0,2).join('.')); - if ( i < 0 ) { - return null; - } - var j = SLD.all.indexOf(' ', i+1); - return SLD.all.slice(i+1,j).split('.').reverse().join('.'); + var tldOffset = domain.lastIndexOf('.'); + if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return null; } + var sldOffset = domain.lastIndexOf('.', tldOffset-1); + if ( sldOffset <= 0 || sldOffset >= (tldOffset-1) ) { return null; } + var sldList = SLD.list[domain.slice(tldOffset+1)]; + if ( !sldList ) { return null; } + if ( sldList.indexOf(' ' + domain.slice(sldOffset+1, tldOffset) + ' ') < 0 ) { return null; } + return domain.slice(sldOffset+1); }, noConflict: function(){ if (root.SecondLevelDomains === this) { root.SecondLevelDomains = _SecondLevelDomains; } return this; - }, - init: function() { - var t = []; - for (var tld in SLD.list) { - if (!hasOwn.call(SLD.list, tld)) { - continue; - } - t = t.concat(SLD.list[tld].split('|').map(function(sld){ - return tld + '.' + sld; - })); - } - SLD.all = ' ' + t.sort().join(' ') + ' '; } }; -SLD.init(); - return SLD; })); diff --git a/test/test.js b/test/test.js index 782b0679..4e8842b1 100644 --- a/test/test.js +++ b/test/test.js @@ -475,10 +475,13 @@ test("sld", function() { var tld, slds, sld, iSld; while ( iTld-- ) { tld = tlds[iTld]; - slds = list[tld].split("|"); + // We trim and split on whitespaces, so if someone mistakenly uses + // more than one space to separate the SLD fragments, it will cause + // the tests to fail. + slds = list[tld].trim().split(/\s+/); iSld = slds.length; while ( iSld-- ) { - sld = slds[iSld] + '.' + tld; + sld = slds[iSld].trim() + '.' + tld; u.hostname("www.example." + sld); equal(u.is("sld"), true, "is sld"); equal(u.domain(), "example." + sld, "domain is example." + sld); From 86f58f938c6af542026f7015ee2ed9839274857b Mon Sep 17 00:00:00 2001 From: gorhill Date: Fri, 25 Oct 2013 15:49:50 -0200 Subject: [PATCH 5/6] added link to jsperf test --- src/SecondLevelDomains.js | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/SecondLevelDomains.js b/src/SecondLevelDomains.js index 7454563b..ad9e1641 100644 --- a/src/SecondLevelDomains.js +++ b/src/SecondLevelDomains.js @@ -175,6 +175,11 @@ var SLD = { "za":" ac agric alt bourse city co cybernet db edu gov grondar iaccess imt inca landesign law mil net ngo nis nom olivetti org pix school tm web ", "zm":" ac co com edu gov net org sch " }, + // gorhill 2013-10-25: Using indexOf() instead Regexp(). Significant boost + // in both performance and memory footprint. No initialization required. + // http://jsperf.com/uri-js-sld-regex-vs-binary-search/4 + // Following methods use lastIndexOf() rather than array.split() in order + // to avoid any memory allocations. has: function(domain) { var tldOffset = domain.lastIndexOf('.'); if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return false; } From 917324b66b87673d19b0b955e2bfe6a607abef05 Mon Sep 17 00:00:00 2001 From: gorhill Date: Mon, 28 Oct 2013 09:19:23 -0200 Subject: [PATCH 6/6] coding style --- src/SecondLevelDomains.js | 56 ++++++++++++++++++++++++++------------- test/test.js | 7 ++--- 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/src/SecondLevelDomains.js b/src/SecondLevelDomains.js index ad9e1641..e646184d 100644 --- a/src/SecondLevelDomains.js +++ b/src/SecondLevelDomains.js @@ -181,31 +181,51 @@ var SLD = { // Following methods use lastIndexOf() rather than array.split() in order // to avoid any memory allocations. has: function(domain) { - var tldOffset = domain.lastIndexOf('.'); - if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return false; } - var sldOffset = domain.lastIndexOf('.', tldOffset-1); - if ( sldOffset <= 0 || sldOffset >= (tldOffset-1) ) { return false; } + var tldOffset = domain.lastIndexOf("."); + if (tldOffset <= 0 || tldOffset >= (domain.length-1)) { + return false; + } + var sldOffset = domain.lastIndexOf(".", tldOffset-1); + if (sldOffset <= 0 || sldOffset >= (tldOffset-1)) { + return false; + } var sldList = SLD.list[domain.slice(tldOffset+1)]; - if ( !sldList ) { return false; } - return sldList.indexOf(' ' + domain.slice(sldOffset+1, tldOffset) + ' ') >= 0; + if (!sldList) { + return false; + } + return sldList.indexOf(" " + domain.slice(sldOffset+1, tldOffset) + " ") >= 0; }, is: function(domain) { - var tldOffset = domain.lastIndexOf('.'); - if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return false; } - var sldOffset = domain.lastIndexOf('.', tldOffset-1); - if ( sldOffset >= 0 ) { return false; } + var tldOffset = domain.lastIndexOf("."); + if (tldOffset <= 0 || tldOffset >= (domain.length-1)) { + return false; + } + var sldOffset = domain.lastIndexOf(".", tldOffset-1); + if (sldOffset >= 0) { + return false; + } var sldList = SLD.list[domain.slice(tldOffset+1)]; - if ( !sldList ) { return false; } - return sldList.indexOf(' ' + domain.slice(0, tldOffset) + ' ') >= 0; + if (!sldList) { + return false; + } + return sldList.indexOf(" " + domain.slice(0, tldOffset) + " ") >= 0; }, get: function(domain) { - var tldOffset = domain.lastIndexOf('.'); - if ( tldOffset <= 0 || tldOffset >= (domain.length-1) ) { return null; } - var sldOffset = domain.lastIndexOf('.', tldOffset-1); - if ( sldOffset <= 0 || sldOffset >= (tldOffset-1) ) { return null; } + var tldOffset = domain.lastIndexOf("."); + if (tldOffset <= 0 || tldOffset >= (domain.length-1)) { + return null; + } + var sldOffset = domain.lastIndexOf(".", tldOffset-1); + if (sldOffset <= 0 || sldOffset >= (tldOffset-1)) { + return null; + } var sldList = SLD.list[domain.slice(tldOffset+1)]; - if ( !sldList ) { return null; } - if ( sldList.indexOf(' ' + domain.slice(sldOffset+1, tldOffset) + ' ') < 0 ) { return null; } + if (!sldList) { + return null; + } + if (sldList.indexOf(" " + domain.slice(sldOffset+1, tldOffset) + " ") < 0) { + return null; + } return domain.slice(sldOffset+1); }, noConflict: function(){ diff --git a/test/test.js b/test/test.js index 4e8842b1..c0093471 100644 --- a/test/test.js +++ b/test/test.js @@ -475,18 +475,15 @@ test("sld", function() { var tld, slds, sld, iSld; while ( iTld-- ) { tld = tlds[iTld]; - // We trim and split on whitespaces, so if someone mistakenly uses - // more than one space to separate the SLD fragments, it will cause - // the tests to fail. slds = list[tld].trim().split(/\s+/); iSld = slds.length; while ( iSld-- ) { - sld = slds[iSld].trim() + '.' + tld; + sld = slds[iSld].trim() + "." + tld; u.hostname("www.example." + sld); equal(u.is("sld"), true, "is sld"); equal(u.domain(), "example." + sld, "domain is example." + sld); equal(u.subdomain(), "www", "subdomain is www"); - u.hostname('www.example.' + tld); + u.hostname("www.example." + tld); equal(u.is("sld"), false, "is not sld"); } }