Javascript Compression, Madness
A couple days ago I wrote a post about how to do the n-body simulation in javascript with canvas. But if I wanted to submit it to js1k, it needed to fit into 1024 bytes. The following post is about my own javascript compressor written in python. I start with 3.1kb then get with minification to 1.4kb and with my own compression algorithm to 0.9kb.
If you're already familiar with yui-compressor and tweaking a bit afterwards, you might want to skip directly to Step 3 where I explain how I built the self-deflating javascript code.
ls -l gravity.js -rw-r--r-- 1 pyalot pyalot 3253 2010-08-25 13:48 gravity.js
So the uncompressed version of the code I want to submit has a size of 3253 bytes. Ouch, far larger than 1024 bytes.
Download
You can download the source for the mad compressor if you want to toy with it yourself
Step 1 - YUI Compressor
This is a pretty nifty javascript compressor, it parses javascript and does all kinds of optimizations. Running my code trough this:
yui-compressor gravity.js > script.js ls -l script.js -rw-r--r-- 1 pyalot pyalot 1800 2010-08-25 15:59 script.js
Aha, that's better, I'm down to 1800 bytes. So lets look at the output and see if there's room for improvement.
function Vector(a,b){this.x=a;this.y=b;this.sub=function(c) {return new Vector(this.x-c.x,this.y-c.y)};this.isub=functi on(c){this.x-=c.x;this.y-=c.y};this.iadd=function(c){this.x +=c.x;this.y+=c.y};this.size=function(){return Math.sqrt(th is.x*this.x+this.y*this.y)};this.idiv=function(c){this.x/=c ;this.y/=c};this.zero=function(){this.x=this.y=0};this.vali date=function(){if(isNaN(this.x+this.y)){this.x=this.y=0}}} function Particle(a){this.acceleration=new Vector(0,0);this .velocity=new Vector(Math.random()-0.5,Math.random()-0.5);t his.position=new Vector(Math.random()*a.width,Math.random() *a.height);this.step=function(b){this.acceleration.validate ();this.velocity.iadd(this.acceleration);speed=this.velocit y.size();if(speed>4){this.velocity.idiv(speed/4);speed=4}th is.position.iadd(this.velocity);this.acceleration.zero();if (this.position.x<0){this.position.x=0;this.velocity.x/=-2}e lse{if(this.position.x>a.width){this.position.x=a.width;thi s.velocity.x/=-2}}if(this.position.y<0){this.position.y=0;t his.velocity.y/=-2}else{if(this.position.y>a.height){this.p osition.y=a.height;this.velocity.y/=-2}}color=Math.round(sp eed*128);b.fillStyle="rgba("+(color-50)+","+(300-color)+",0 ,1)";b.beginPath();b.arc(this.position.x,this.position.y,sp eed+2,0,Math.PI*2,false);b.fill()}}gc="globalCompositionOpe ration";new (function System(){var a=document.getElementByI d("c");a.width=a.height=300;var c=a.getContext("2d");var d= [];for(var b=0;b<30;b++){d.push(new Particle(a))}setInterva l(function(){c[gc]="source-over";c.fillStyle="rgba(0,0,0,0. 2)";c.fillRect(0,0,a.width,a.height);c[gc]="lighter";for(va r l=0,g=30;l<g;l++){var f=d[l];for(var h=l+1;h<30;h++){var e=d[h];var k=f.position.sub(e.position);k.idiv(Math.pow(Mat h.max(10,k.size()),3)/5);e.acceleration.iadd(k);f.accelerat ion.isub(k)}f.step(c)}},40)});
It did a fairly good job, but there's a couple things that could be more compact, like class names for Vector/Particles and a bunch of method names.
Step 2 - Compact some more
There's a few fairly simple patterns I can kill with a regular expression. Of course it's not safe to do so generally, but for short code written to be compressed, it's ok. So I wrote a short script to cleanup some more:
#!/usr/bin/env python import re, sys method_re = re.compile(r'\.([a-zA-Z]+)') objects_re = re.compile('new ([a-zA-Z]+)') blacklist = [ 'length', 'random', 'getElementById', 'getContext', 'beginPath', 'arc', 'fill', 'push', 'globalCompositeOperation', 'fillStyle', 'pos', 'pow', 'sqrt', 'width', 'height', 'PI', 'fillRect', 'log', 'join', 'round', 'min', 'max', ] if __name__ == '__main__': source = sys.stdin.read() # replace method names with short symbols method_names = 'abcdefghijklmnopqrstuvwxyz' methods = set() for match in method_re.finditer(source): method = match.group(1) if method not in blacklist and len(method) > 1: methods.add(method) i = 0 for method in methods: source = source.replace(method, method_names[i]) i += 1 # replace obvious objectnames with symbols objects = set() for match in objects_re.finditer(source): name = match.group(1) objects.add(name) object_names = 'abcdefghijklmnopqrstuvwxyz'.upper() i = 0 for name in objects: source = source.replace(name, object_names[i]) i += 1 print source
What this does is look out for obvious method names and if they're not in the method blacklist, replace them with a one character code. Then it looks for obvious object names, and replaces them with a one character code. If you write fairly clean code, that kind of substitution is reasonably safe, well, for a 1024 byte demo anyway.
So let's see what I got:
yui-compressor gravity.js | ./compact > script.js ls -l script.js -rw-r--r-- 1 pyalot pyalot 1462 2010-08-25 16:05 script.js
Yeah, that's not much better, but I got to 1462 bytes, shaved off 338 bytes. Better then nothing, but still too large.
Let's have a look at the output again and see if we missed anything:
function A(a,b){this.x=a;this.y=b;this.c=function(c){return new A(this.x-c.x,this.y-c.y)};this.b=function(c){this.x-=c .x;this.y-=c.y};this.j=function(c){this.x+=c.x;this.y+=c.y} ;this.k=function(){return Math.sqrt(this.x*this.x+this.y*th is.y)};this.e=function(c){this.x/=c;this.y/=c};this.f=funct ion(){this.x=this.y=0};this.i=function(){if(isNaN(this.x+th is.y)){this.x=this.y=0}}}function B(a){this.a=new A(0,0);th is.h=new A(Math.random()-0.5,Math.random()-0.5);this.g=new A(Math.random()*a.width,Math.random()*a.height);this.d=func tion(b){this.a.i();this.h.j(this.a);speed=this.h.k();if(spe ed>4){this.h.e(speed/4);speed=4}this.g.j(this.h);this.a.f() ;if(this.g.x<0){this.g.x=0;this.h.x/=-2}else{if(this.g.x>a. width){this.g.x=a.width;this.h.x/=-2}}if(this.g.y<0){this.g .y=0;this.h.y/=-2}else{if(this.g.y>a.height){this.g.y=a.hei ght;this.h.y/=-2}}color=Math.round(speed*128);b.fillStyle=" rgba("+(color-50)+","+(300-color)+",0,1)";b.beginPath();b.a rc(this.g.x,this.g.y,speed+2,0,Math.PI*2,false);b.fill()}}g c="globalComgOperation";new (function System(){var a=docume nt.getElementById("c");a.width=a.height=300;var c=a.getCont ext("2d");var d=[];for(var b=0;b<30;b++){d.push(new B(a))}s etInterval(function(){c[gc]="source-over";c.fillStyle="rgba (0,0,0,0.2)";c.fillRect(0,0,a.width,a.height);c[gc]="lighte r";for(var l=0,g=30;l<g;l++){var f=d[l];for(var h=l+1;h<30; h++){var e=d[h];var k=f.g.c(e.g);k.e(Math.pow(Math.max(10,k .k()),3)/5);e.a.j(k);f.a.b(k)}f.d(c)}},40)});
Nope, doesn't look like missing much. So what now?
Step 3 - I'm not responsible for your sanity
Preamble: everything that follows now is completely useless for general use except for 1k demo competitions and the like.
The code does still contain sequences that are repeated fairly often, like "this." or "function " etc. I can't minify those with the usual compressors because they're javascript keywords. But if I had some mechanism to shrink them...
What I need is an actual compression algorithm. The problem with those is that they usually build symbol tables and use multi-byte codes to index it. I don't have space for that kind of thing. But I do need a symbol table and indices into it of some sort.
Free characters?
Let's see if there's any ascii unused characters for the code.
def find_free(data): exclude = set('\n\\\r\t\x0b\x0c\'"') candidates = set(string.printable) - exclude return candidates - set(data) if __name__ == '__main__': data = sys.stdin.read().strip() data = data.replace('"', "'").replace('\n', '\\n') keys = find_free(data) print len(keys), keys
I'm excluding line feed (\n), carriage return (\r), vertical tab (x0b) double quotation mark (") and form feed new page (x0c) because those don't work in javascript strings without escaping. I'm excluding tab (\t) because I want to use that later. So indeed, there are 32 characters unused!
!#%$&769:?@DGFHKJLQUTWVYXZ_^`z|~
Pattern matching
If I could replace common patterns in the code with these characters, I could save space. I need to find common patterns, so let's enumerate all patterns the code has.
def window_iter(data, length): for i in range(len(data) - length): yield data[i:i+length]
This is a sliding window iterator, if you where to call with this code:
for window in window_iter(data, 10): print window
You would get something like this:
&h.xLelse{
h.xLelse{i
.xLelse{if
xLelse{if(
Lelse{if(9
else{if(9x
lse{if(9x>
se{if(9x>:
e{if(9x>:$
...
So if I count the occurrences of the pattern in the source and calculate occurrence * len(pattern), I know which one is best. This is what the find_best function does.
def find_best(data): symbols = {} for size in range(2, 100): hit = False for window in window_iter(data, size): count = data.count(window) if count > 1: hit = True symbols[window] = count * size if not hit: break return sorted(symbols, key=symbols.__getitem__)[-1]
Actual compression
So I replace all these best occurrences successively until no more keys are left:
if __name__ == '__main__': data = sys.stdin.read().strip() data = data.replace('"', "'").replace('\n', '\\n') keys = find_free(data) symbols = [] while keys: sequence = find_best(data) key = keys.pop() data = data.replace(sequence, key) symbols.insert(0, key+sequence)
Putting it back together
But now I have a symbol table with key/sequence relationships and a chunk of unreadable text. I need a javascript algorithm that can decompress it.
symbols = "the symbol table".split("\t"); data = "the remaining data"; do{ previous = data.length; for(i=0; i<symbols.length; i++){ symbol = symbols[i]; key = symbol[0]; sequence = symbol.substr(1,99); data = data.replace(key, sequence); } } while(previous != data.length) eval(data);
This algorithm expands each symbol in the data string with the sequence from the table for that key, until data does not expand anymore.
So I need to wrap my symbol table and data into that template, and the template needs to be pretty short too:
def wrap(symbols, data): template = '''\ s="%(symbols)s".split("%(symsep)s");d="%(data)s";\ do{p=d.length;for(i=0;i<s.length;i++){q=s[i];\ d=d.replace(q[0],q.substr(1,99))}}while(p!=d.length)\ eval(d)''' return template % { 'symsep' : '\t', 'data' : data, 'symbols' : '\t'.join(symbols), }
And finally the compressed code must be output to stdout:
print wrap(symbols, data)
This can't possibly work or can it?
There's about 120 bytes additional code for the decompressor and the symbol table format has two more bytes for each of the 32 symbols. So doing some math, 1024-120-64 = 872 bytes. I need a compression before serialization to 57% of its original size, and just with 32 symbols. Did it work?
yui-compressor gravity.js | ./compact | ./jszip > script.js -rw-r--r-- 1 pyalot pyalot 1012 2010-08-25 17:25 script.js
Yes! It does work, I'm down to 1012 bytes! If you care to check it out you can see that it's also valid javascript after decompression.
How does the code look now? Weird.
s="~=Yx&y |a. zif( `Kc$J ^=HA( _/=-2}} Zc[gc]=' X)} Yc. V&h. Wcolor T$return U$J=!y=0} Q;for(6 L0 , J!x K=#( Hnew FMath. G); D/=-2}else{if(? @speed ? !g. :.fillStyle='rgba( 9a.width 6var 7a.height &;! $){ %Math.random() #function !this.".split(" ");d="# A(a,b$J=a&y=b&cKcTHA(J-Yx,!y-YyX&b`-~-=Yy}&j`+~+=Yy}&kKTFs qrt(J*J+!y*!yX&e`/=c&y/=c}&fKU&iK$zisNaN(J+!y)U}}# B(a$!a^L 0)&h^%-0.5,%-0.5)&g^%*9,%*7)&dKb$!|i()Vj(!aG@=!h.k(Gz@>4$!h .e(@/4G@=4}?j(!h)&|f(Gz?x<0$?x=0VxDx>9$?x=9Vx_z?y<0$?y=0VyD y>7$?y=7Vy_W=Fround(@*128Gb:'+(W-50)+','+(300-W)+',L1)';b.b eginPath(Gb.arc(?x,?y,@+2,LFPI*2,falseGb.fill(X}gc='globalC omgOperation';H(# System($6a=document.getElementById('c'G9= 7=300;6c=|getContext('2d'G6d=[]Qb=0;b<30;b++$d.push(HB(a)Xs etInterval(#($Zsource-over';c:LLL0.2)';YfillRect(LL9,7GZlig hter'Ql=Lg=30;l<g;l++$6f=d[l]Qh=l+1;h<30;h++$6e=d[h];6k=f.g .c(e.gGk.e(Fpow(Fmax(1Lk.k()),3)/5Ge.|j(kGf.|b(kXf.d(cX},40 XG";do{p=d.length;for(i=0;i<s.length;i++){q=s[i];d=d.replac e(q[0],q.substr(1,99))}}while(p!=d.length)eval(d)
So my js1k submission is away, and I'm having high hopes for honorary mention for the most elaborate compression scheme :)



