Parsing the Firefox cache

Write a module ffcache.py that contains a set of functions for reading Firefox cache data into useful data structures that can be used by other programs.

Capture all available information including URLs, timestamps, sizes, locations, content types, etc.

Important notice

This is a large example that we build in parts. The code below is the complete solution, which will be discussed part by part. Place it into a file ffcache.py and then continue on to the examples that follow.

Solution

# ffcache.py
#
# Functions for extracting meta-data from the Firefox browser cache

#-------------------------------------------------------------------
# Part 1

import struct

# This function parses a cache metadata header into a dict
# of named fields (listed in _headernames below)

_headernames = ['magic','location','fetchcount',
                'fetchtime','modifytime','expiretime',
                'datasize','requestsize','infosize']

# Pre-compiled layout of a cache entry header: nine big-endian
# unsigned 32-bit integers (36 bytes total).  Compiling once avoids
# re-parsing the format string for every entry scanned.
_headerstruct = struct.Struct(">9I")

def parse_meta_header(headerdata):
    """Parse a 36-byte cache metadata header into a dict.

    headerdata : exactly 36 bytes read from a _CACHE_00n_ file.
    Returns a dict keyed by the field names in _headernames.
    Raises struct.error if headerdata is not exactly 36 bytes.
    """
    head = _headerstruct.unpack(headerdata)
    meta = dict(zip(_headernames, head))
    return meta

#-------------------------------------------------------------------
# Part 2

import re
# Every NUL-separated part of the request data must consist solely of
# printable ASCII characters (plus CR/LF); anything else means we are
# not looking at a real cache entry.
part_pat = re.compile(r'[\n\r -~]*$')

def parse_request_data(meta, requestdata):
    """Validate and parse the request section of a cache entry.

    meta        : dict produced by parse_meta_header(); on success the
                  'request' (URL) and 'info' (key/value pairs) fields
                  are added to it.
    requestdata : the meta['requestsize'] + meta['infosize'] bytes that
                  follow the header.

    Returns True if the data looks like a valid entry, False otherwise.
    """
    parts = requestdata.split('\x00')
    for part in parts:
        if not part_pat.match(part):
            return False

    request = parts[0]
    # The stored request size counts the terminating NUL byte
    if len(request) != (meta['requestsize'] - 1):
        return False

    # Requests look like "HTTP:http://...". A request with no ':' is
    # malformed -- report failure instead of raising IndexError below.
    if ':' not in request:
        return False

    # Remaining parts alternate key, value, key, value, ...
    info = dict(zip(parts[1::2], parts[2::2]))
    meta['request'] = request.split(':', 1)[1]
    meta['info'] = info
    return True

#-------------------------------------------------------------------
# Part 3

# Given a metadata dictionary, this function adds additional
# fields related to the content type, charset, and encoding

import email
def add_content_info(meta):
    """Add 'content-type', 'content-encoding', and 'charset' fields
    to meta, derived from the HTTP response headers stored under
    meta['info']['response-head'] (if present).
    """
    info = meta['info']
    if 'response-head' not in info:
        meta['content-type'] = 'unknown'
        meta['content-encoding'] = None
        meta['charset'] = ''
    else:
        # Strip the HTTP status line; what remains is RFC-822 style
        # headers that the email module can parse.  Guard against a
        # response-head with no newline at all (status line only),
        # which would otherwise raise IndexError.
        head_parts = info['response-head'].split("\n", 1)
        rhead = head_parts[1] if len(head_parts) > 1 else ''
        m = email.message_from_string(rhead)
        meta['content-type'] = m.get_content_type()
        meta['content-encoding'] = m.get('content-encoding', None)
        # May be None if the response declared no charset
        meta['charset'] = m.get_content_charset()

#-------------------------------------------------------------------
# Part 4

# Scan a single file in the firefox cache
# Scan a single file in the firefox cache
def scan_cachefile(f, blocksize):
    """Scan one open _CACHE_00n_ file, yielding a metadata dict for
    each plausible cache entry found.

    f         : file object opened in binary mode.
    blocksize : block size used by this cache file (256/1024/4096).
    """
    maxsize = 4 * blocksize    # Maximum size of an entry
    f.seek(4096)               # Skip the bit-map at the front of the file
    while True:
        headerdata = f.read(36)
        # Stop on EOF and also on a truncated trailing header --
        # parse_meta_header would raise struct.error on anything
        # shorter than 36 bytes.
        if len(headerdata) < 36:
            break
        meta = parse_meta_header(headerdata)
        # Heuristic sanity check: known magic number and a believable
        # combined size for the request + info sections.
        if (meta['magic'] == 0x00010008 and
            meta['requestsize'] + meta['infosize'] < maxsize):
               requestdata = f.read(meta['requestsize'] +
                                    meta['infosize'])
               if parse_request_data(meta, requestdata):
                    add_content_info(meta)
                    yield meta

        # Move the file pointer to the start of the next block
        fp = f.tell()
        if (fp % blocksize):
            f.seek(blocksize - (fp % blocksize), 1)


#-------------------------------------------------------------------
# Part 5

# Given the name of a Firefox cache directory, the function
# scans all of the _CACHE_00n_ files for metadata. A sequence
# of dictionaries containing metadata is returned.

import os
def scan_cache(cachedir):
    """Scan all of the _CACHE_00n_ files in a Firefox cache directory.

    cachedir : path to the cache directory.
    Yields one metadata dict per cache entry, with 'cachedir' and
    'cachefile' fields added to identify where the entry was found.
    """
    files = [('_CACHE_001_', 256),
             ('_CACHE_002_', 1024),
             ('_CACHE_003_', 4096)]

    for cname, blocksize in files:
        cfile = open(os.path.join(cachedir, cname), "rb")
        # try/finally guarantees the file is closed even if
        # scan_cachefile raises part-way through or the consumer
        # abandons this generator early.
        try:
            for meta in scan_cachefile(cfile, blocksize):
                meta['cachedir'] = cachedir
                meta['cachefile'] = cname
                yield meta
        finally:
            cfile.close()


#-------------------------------------------------------------------
# Part 6

# scan an entire list of cache directories producing
# a sequence of records

def scan(cachedirs):
    """Scan one or more Firefox cache directories.

    cachedirs : either a single directory path (string) or a list of
                directory paths.
    Yields metadata dicts for every entry in every directory given.
    """
    # Normalize a lone directory string into a one-element list
    dirs = [cachedirs] if isinstance(cachedirs, str) else cachedirs
    for directory in dirs:
        for record in scan_cache(directory):
            yield record


Example use

To test out the above module, create a file ffex.py that starts with this code:
# ffex.py
import ffcache
import os

# Hard-wired location of some cache directory  (Will depend on your machine)
cachedir = 'C:/Documents and Settings/David Beazley/Local Settings' \
           '/Application Data/Mozilla/Firefox/Profiles/f60qcxvb.default/Cache'
Now, add these code fragments as we go to see how things work. Please experiment more on your own.

Example 1 : Reading a meta-data header

f = open(os.path.join(cachedir,"_CACHE_001_"),"rb")
f.seek(4096)
headerdata = f.read(36)
meta = ffcache.parse_meta_header(headerdata)
print meta

Example 2: Parsing Request data

requestdata = f.read(meta['requestsize']+meta['infosize'])
ffcache.parse_request_data(meta,requestdata)
print meta

print meta['request']
print meta['info']

Example 3: Adding content info

ffcache.add_content_info(meta)
print meta['content-type']
print meta['content-encoding']
print meta['charset']

Example 4: Scanning a single cache file

f = open(os.path.join(cachedir,"_CACHE_001_"),"rb")
for meta in ffcache.scan_cachefile(f,256):
    print meta['request']

Example 5: Scanning an entire directory

for meta in ffcache.scan_cache(cachedir):
    print meta['request']
Here are some sample queries you can try:
# Find all requests related to slashdot
for meta in ffcache.scan_cache(cachedir):
    if 'slashdot' in meta['request']:
        print meta['request']


# Find all large JPEG images
jpegs = (meta for meta in ffcache.scan_cache(cachedir)
              if meta['content-type'] == 'image/jpeg'
              and meta['datasize'] > 100000)

for j in jpegs:
    print j['request']