Even faster String.prototype.trim() implementation in JavaScript.

I’ll begin with a quote:

Some people, when confronted with a problem, think “I know, I’ll use regular expressions.” Now they have two problems.

Those are the words of one of my favorite hackers, Jamie Zawinski.

Most JavaScript trim() implementations you see around are based on regular expressions and work fairly well for infrequent use on short strings. Some of them are well-crafted, but I think plain old loops can do better. Back in 2007, Steve Levithan covered some really fast implementations of trim() followed by a few more articles by others, including Luca Guidi.

Many of these implementations were based on regular expressions and a few were exceptionally brilliant. But did they answer the question of speed? In one of the comments posted on Steve Levithan’s blog there was a little gem written by Michael Lee Finney, and truly that little function proves to be the fastest of all of the implementations mentioned in those articles. Finney claimed to better previous implementations by 20x, and he was right. His code totally smoked the regular-expression based implementations.

The Details

Arrays in JavaScript behave like hash-tables and hence exhibit the property of being sparse.  Finney’s implementation exploits this feature of JavaScript to its fullest.  Here is a lookup table he uses to mark characters as whitespace:

String.whiteSpace = [];
String.whiteSpace[0x0009] = true;
String.whiteSpace[0x000a] = true;
String.whiteSpace[0x000b] = true;
String.whiteSpace[0x000c] = true;
String.whiteSpace[0x000d] = true;
String.whiteSpace[0x0020] = true;
String.whiteSpace[0x0085] = true;
String.whiteSpace[0x00a0] = true;
String.whiteSpace[0x1680] = true;
String.whiteSpace[0x180e] = true;
String.whiteSpace[0x2000] = true;
String.whiteSpace[0x2001] = true;
String.whiteSpace[0x2002] = true;
String.whiteSpace[0x2003] = true;
String.whiteSpace[0x2004] = true;
String.whiteSpace[0x2005] = true;
String.whiteSpace[0x2006] = true;
String.whiteSpace[0x2007] = true;
String.whiteSpace[0x2008] = true;
String.whiteSpace[0x2009] = true;
String.whiteSpace[0x200a] = true;
String.whiteSpace[0x200b] = true;
String.whiteSpace[0x2028] = true;
String.whiteSpace[0x2029] = true;
String.whiteSpace[0x202f] = true;
String.whiteSpace[0x205f] = true;
String.whiteSpace[0x3000] = true;

The implementation of the function simply indexes into this array to check whether any character encountered while traversing the string is valid whitespace. The function runs noticeably faster. Here’s his code:

function trim14(str) {
    var len = str.length, whiteSpace, i;

    if (!len) {
        return str;
    }

    whiteSpace = String.whiteSpace;

    if (len && whiteSpace[str.charCodeAt(len-1)])
    {
        do{
            --len;
        }
        while (len && whiteSpace[str.charCodeAt(len - 1)]);

        if (len && whiteSpace[str.charCodeAt(0)]){
            i = 1;
            while (i < len && whiteSpace[str.charCodeAt(i)]){
                ++i;
            }
        }
        return str.substring(i, len);
    }
    if (len && whiteSpace[str.charCodeAt(0)]) {
        i = 1;
        while (i < len && whiteSpace[str.charAt(i)]){
            ++i;
        }
        return str.substring(i, len);
    }
    return str;
};

Neatly written. I did spot a few unnecessary checks and removed them for a retest. The following code shaved off a couple more milliseconds sometimes.

function trim16(str) {
    var len = str.length, whiteSpace = String.whiteSpace, i = 0;

    if (len) {
        if (whiteSpace[str.charCodeAt(len - 1)]) {
            // Remove from the end.
            while (--len && whiteSpace[str.charCodeAt(len - 1)]);

            // Remove from the beginning.
            if (len && whiteSpace[str.charCodeAt(0)]){
                i = 1;
                while (i < len && whiteSpace[str.charCodeAt(i)]){
                    ++i;  // Keep this here.
                }
            }
            return str.substring(i, len);
        }

        // Remove from the beginning.
        if (whiteSpace[str.charCodeAt(0)]) {
            i = 1;
            while (i < len && whiteSpace[str.charAt(i)]){
                ++i;
            }
            return str.substring(i, len);
        }
    }

    return str;
};

Can this be any better and faster?

I think it can. There’s repetition in the code that could be removed as well.  I rewrote the function to use Finney’s look up table and ended up with this:

function trim17(str){
    var len = str.length;
    if (len){
        var whiteSpace = String.whiteSpace, i = 0;
        while (whiteSpace[str.charCodeAt(--len)]);
        if (++len){
            while (whiteSpace[str.charCodeAt(i)]){ ++i; }
        }
        str = str.substring(i, len);
    }
    return str;
}

Guess what? Here are the benchmark results from a Chrome developer build running on Linux with 100000 iterations:

Original length: 27663
trim10: 363ms (length: 27656)
trim11: 4668ms (length: 27656)
trim12: 4671ms (length: 27656)
trim13: 322ms (length: 27656)
trim14: 197ms (length: 27656)
trim15: 195ms (length: 27656)
trim16: 197ms (length: 27656)
trim17: 187ms (length: 27656)

trim17 is indeed even faster and shorter.

Updates

Nikolay “MadRabbit” V. Nemshilov, the author of RightJS (a beautiful JavaScript library), provides another implementation of the function:

function trim19(str){
    var str = str.replace(/^\s\s*/, ''),
        ws = /\s/,
        i = str.length;
    while (ws.test(str.charAt(--i)));
    return str.slice(0, i + 1);
}

which is faster than many other regexp-based implementations.

I have included that in the benchmark as well and generated a couple graphs.

Average performance of all trim() implementations across major platforms and browsers.
Average performance of all trim() implementations across major platforms and browsers (Smaller is better).
Average performance of the latter half of the set of trim() implementations.
Average performance of the latter half of the set of trim() implementations (Smaller is better).

Feel free to use my code under the terms of the MIT license. As usual, suggestions and constructive criticism are most welcome.

Advertisements

Making the browser download scripts in parallel.

Current browsers download scripts serially.  What that means is that if you use:

<script type="text/javascript" src="foobar.js"></script>
<script type="text/javascript" src="quuxdoo.js"></script>

The browser will not fetch quuxdoo.js until it has finished downloading foobar.js.   In fact, it will not download any other resource until that script is downloaded.  This is really a sad state of affairs.

An existing “solution”

An article “Downloading JavaScript files in Parallel” by Rakesh Pai explains the issue with a couple screenshots displaying the Firebug Net panel.

<script type="text/javascript">
    (function() {
        var s = [
            "/javascripts/script1.js",
            "/javascripts/script2.js",
            "/javascripts/script3.js",
            "/javascripts/script4.js",
            "/javascripts/script5.js"
        ];

        var sc = "script", tp = "text/javascript", sa = "setAttribute";
        for(var i=0, l=s.length; i<l; ++i) {
            if(window.navigator.userAgent.indexOf("MSIE")!==-1 || window.navigator.userAgent.indexOf("WebKit")!==-1) {
                document.writeln("<" + sc + " type=\"" + tp + "\" src=\"" + s[i] + "\" defer></" + sc + ">");
            } else {
                var t=document.createElement(sc);
                t[sa]("src", s[i]);
                t[sa]("type", tp);
                document.getElementsByTagName("head")[0].appendChild(t);
            }
        }
    })();
</script>

He provides a solution that seems to work well, but contains a few things I didn’t like or consider bugs:

  1. Browser sniffing.  Do Firefox and Opera not support the document.write() method? No, really, I’m asking here.  I don’t have a copy of FF 1.5 or 2.0 lying around to verify.  The latest version of Opera does seem to indeed.  And not just that, the script checks for the browser per iteration of the loop, so if you have 10 scripts that you load with this method, the browser check will occur 10 times!
  2. The code tries to optimize for space but ends up being harder to read instead.  This should ideally be the job of a minifier like YUI Compressor, Douglas Crockford’s JSMIN, Dojo Toolkit’s ShrinkSafe, or Dean Edwards’ Packer.  A couple of those space-saving tricks can be used to hint the minifier better while keeping the code readable.
  3. Take HTML 5.  New script element attributes like async and defer aren’t handled.  Both are defined as boolean attributes and async=”true” means the script would execute asynchronously as soon as it is available. What about scripts that aren’t of mime type text/javascript?  A better approach would have been to consume an array of attribute objects and fill those attributes into the created script element.
  4. Loop invariants should be outside the loop especially in a language like JavaScript where linear scope chains determine how quickly a variable can be accessed.  The code negligently accesses global objects and repeatedly goes down nested objects, which it really doesn’t need to per iteration.
  5. XHTML isn’t dead just yet.  Attribute minimization (ref. “<script … defer></script>”) is not valid XHTML.  Language purists may cringe.

Where’s the code?

Here is a non-browser sniffing version that offers the above flexibility.  Please note, as of this time I do not have access to any version of the-browser-that-shall-not-be-named (blah, pottermania), so I’m relying on the community to test this piece of code in the-browser-that-shall-not-be-named.  Enough said.  Here’s my attempt at the code:

/**
 * Loading scripts in parallel.
 * Copyright (C) 2009 Yesudeep Mangalapilly.
 *
 * Licensed under the terms of the MIT License.
 */
function getScripts(scripts){
    var lt = "%3C",
        gt = "%3E",
        doc = document,
        len = scripts.length,
        html = [];

    for (var i = 0, script = null, attribute_string = null; i < len; ++i){
        script = scripts[i];
        switch (typeof script){
        case 'string':
            attribute_string = ['src=\"', decodeURI(script), '\"'].join('');
            break;
        case 'object':
            var attrs = [];
            for (var key in script){
                var value = script[key];
                switch(key){
                    case 'src':
                    case 'SRC':
                        value = decodeURI(value);
                        break;
                    default: break;
                }
                attrs.push([key, '\"' + value + '\"'].join('='));
            }
            attribute_string = attrs.join(' ');
            break;
        default:
            continue;
        }
        html.push(lt, 'script ', attribute_string, gt, lt, '/script', gt);
    }
    html = unescape(html.join(''));
    doc.write(html);
}

Usage

Here is how you can use the code:

var scripts = [
    {
        src: "foobar.js",
        async: true
    },
    "quuxdoo.js"
];

getScripts(scripts);

Hope that helps.  Suggestions and constructive criticism most welcome.

memcache.js for Google App Engine application front-ends?

The Google App Engine datastore is a pretty neat example of a key-value datastore.  Mapping this to JavaScript objects can be fairly straightforward, especially when you need to cache AJAX responses.

An App Engine Model

Let’s say you have a Person model defined in your GAE application:

# models.py
from google.appengine.ext import db
from google.appengine.api import memcache

class Person(db.Model):
    first_name = db.StringProperty()
    last_name = db.StringProperty()
    birthdate = db.DateTimeProperty()

    @classmethod
    def get_all(cls):
        cache_key = 'Person.get_all'
        cached_value = memcache.get(cache_key)
        if cached_value:
            return cached_value
        else:
            people = Person.all().fetch(20)
            memcache.set(cache_key, people, 120)  # cache for 2 minutes/120 seconds
            return people

Getting a list of 20 people (more specifically, Person objects) from the datastore is as easy as calling:

people = Person.get_all()

The Person.get_all() method uses memcache to temporarily cache a copy of the data in distributed memory to avoid hitting the datastore every time it is called.  Of course, as you can see, the data is only cached for a particular duration in memory and then cleared away.

What if you could use memcache on the client side to cache server responses?

You can’t really use the actual memcached daemon for this purpose, but you can surely emulate memcache behavior using a simple JavaScript object to cache values just like memcached would.  Quite naturally, since the code would be restricted to a single script runtime environment, you wouldn’t have distributed memcache either.  But, hey, something is better than nothing.

Caching server responses that result from AJAX calls can make people perceive that your application is pretty quick.  All you’re doing to achieve this is avoiding hitting the Web server repeatedly for the same information.  Let’s look at a code excerpt:

function get_person(key){
  var person = memcache.get(key);
  if (person){
    return person;
  } else {
    remote_api.get_person(key, function(person){
      memcache.set(key, person, 120000);  // Cache for 2 minutes (120 seconds).
    });
  }
}

Note that get_person(key) will not send a request to the server if the data is already available in the memcache store.  In the above case, the data is only cached for 2 minutes, after which any call to get_person() will send a request to the server.

Where’s the code?

/**
 * memcache.js - A simple memcache-like object implemented in JavaScript.
 * Copyright (c) 2009, happychickoo.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer
 *     in the documentation and/or other materials provided with the
 *     distribution.
 *   * Neither the name of happychickoo nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
this.memcache = {
  datastore: {},
  get: function(key){
    return this.datastore[key];
  },
  set: function(key, value, timeout /* milliseconds */){
    var store = this.datastore;
    if (typeof timeout === 'undefined'){
      timeout = 0;
    }
    store[key] = value;
    if (timeout){
      setTimeout(function(){
        delete store[key];
      }, timeout);
    }
  },
  remove: function(key){
    delete this.datastore[key];
  },
  clear: function(){
    this.datastore = {};
  }
};

There you go.

How does this help with GAE applications?

App Engine doesn’t use a conventional RDBMS to store data.  GAE uses a distributed key-value data store, where every object stored in the database is assigned a unique key that looks something like this:

agttaWxzLXNlY3VyZXIKCxIEVXNlchgdDA

This key is guaranteed to start with an alphabet, which implies you can use it as an identifier for a DOM element. Attaching behaviors to such DOM elements that fetch corresponding data then becomes as easy as doing this (example uses jQuery):

jQuery('ul#people > li').click(function(e){
  var key = jQuery(this).attr('id');
  get_person(key, function(person){
    show_information(person);
  });
});

Hope that helps clear out some air.  It should be noted that if you’re adding li elements dynamically to the ul#people unordered list, the above event handler will not be called for them.  Instead of using jQuery(…).click(handler), you should consider using jQuery(…).live(‘click’, handler).

Implementing a Pythonic range() function in JavaScript.

Creating a JavaScript array of say, years 2009 to 2050, can be naively done as follows:

var years = [2009, 2010, 2011, 2012, ... 2050];

If you’re lazier (and I don’t mean that’s a bad thing; Perl aficionados admire the virtue of being lazy), you’d do something along the lines of:

var years = [];
for (var year = 2009; year < 2051; year++){
    years.push(year);
}

And you wouldn’t give it much thought. You’d throw that piece into your code and chug along like a good rabbit. JavaScript 1.7 introduces a cool new feature called an array comprehension. I smell Python. Python has a similar construct, called a list comprehension, that must have been some source of inspiration for the JavaScript standardizing committee. Anyway, here’s what an array comprehension allows you to do:

var years = [year for each (year in range(2009, 2051))];

Sweet. One line that does all of that. But wait. JavaScript doesn’t come with a range() method, so you can very naively implement one using another cool new feature that comes with JavaScript 1.7: generators. We’ll get into much more detail about those in another article perhaps, but here’s an oversimplified explanation about what they are. They’re your usual functions with an orange-y twist. You can pause a generator function right in the middle using the yield keyword and have it give you a value, finish whatever you want to do with that value and ask that generator function to resume where it left off possibly to yield another value to you or run to completion.

function range(begin, end){
    for (let i = begin; i < end; ++i){
        yield i;
    }
}

I cannot possibly take credit for that piece of code because I simply lifted it from the Mozilla Developer Wiki. You may be wondering, why would you ever want to use a generator to do that? Well, the real difference between a generator and an ordinary function that returns a list is the very efficient use of memory. Generators don’t have to return an entire list which may be so long your end-user might simply quit using your application. They just return (more correctly, yield) as much as you need and let you go along your way. Now, generators are shiny and all, but there’s one little problem. What about when you really need that list of numbers, say from 20 to 400? That’s when you would do this:

/**
 * Behaves just like the python range() built-in function.
 * Arguments:   [start,] stop[, step]
 *
 * @start   Number  start value
 * @stop    Number  stop value (excluded from result)
 * @step    Number  skip values by this step size
 *
 * Number.range() -> error: needs more arguments
 * Number.range(4) -> [0, 1, 2, 3]
 * Number.range(0) -> []
 * Number.range(0, 4) -> [0, 1, 2, 3]
 * Number.range(0, 4, 1) -> [0, 1, 2, 3]
 * Number.range(0, 4, -1) -> []
 * Number.range(4, 0, -1) -> [4, 3, 2, 1]
 * Number.range(0, 4, 5) -> [0]
 * Number.range(5, 0, 5) -> []
 *   Number.range(5, 4, 1) -> []
 * Number.range(0, 1, 0) -> error: step cannot be zero
 * Number.range(0.2, 4.0) -> [0, 1, 2, 3]
 */
Number.range = function() {
  var start, end, step;
  var array = [];

  switch(arguments.length){
    case 0:
      throw new Error('range() expected at least 1 argument, got 0 - must be specified as [start,] stop[, step]');
      return array;
    case 1:
      start = 0;
      end = Math.floor(arguments[0]) - 1;
      step = 1;
      break;
    case 2:
    case 3:
    default:
      start = Math.floor(arguments[0]);
      end = Math.floor(arguments[1]) - 1;
      var s = arguments[2];
      if (typeof s === 'undefined'){
        s = 1;
      }
      step = Math.floor(s) || (function(){ throw new Error('range() step argument must not be zero'); })();
      break;
   }

  if (step > 0){
    for (var i = start; i <= end; i += step){
      array.push(i);
    }
  } else if (step < 0) {
    step = -step;
    if (start > end){
      for (var i = start; i > end + 1; i -= step){
        array.push(i);
      }
    }
  }
  return array;
}

And you could now simply do:

var years = Number.range(2009, 2051);

to get a list of years from 2009 to 2050 (inclusive).

Released under the you-may-do-whatever-the-fuck-you-want-with-that-code license.