All this crawler does is take a seed blog URL (my blog), run through all the links on its front page, and store the ones that look like blog-post URLs. I assume that every blog in the world is linked to from at least one other blog, so all of them will eventually get indexed if the spider is given enough time and memory.
This is the code for the crawler. It's in Python and is quite simple. Please run through it and let me know if there is any other way to optimise it further:
import re
import urllib2
import urlparse
from pysqlite2 import dbapi2 as sqlite

conn = sqlite.connect('/home/spider/blogSearch.db')
cur = conn.cursor()

# Seed the crawl frontier with the URL stored against key=1 (my blog).
tocrawltpl = cur.execute('SELECT * FROM blogList WHERE key=1')
for row in tocrawltpl:
        tocrawl = set([row[1]])
# Pulls the href value out of every anchor tag in the fetched page.
linkregex = re.compile(r'<a\s[^>]*?href=[\'"](.*?)[\'"]', re.IGNORECASE)
# Keep crawling until the frontier of unvisited URLs is empty.
while 1:
        try:
                crawling = tocrawl.pop()
        except KeyError:
                # Nothing left to crawl.
                break
        url = urlparse.urlparse(crawling)
        try:
                response = urllib2.urlopen(crawling)
        except Exception:
                # Skip pages that cannot be fetched.
                continue
        msg = response.read()
        links = linkregex.findall(msg)
        for link in links:
                # Only interested in links that look like a Blogspot blog's front page.
                if link.endswith('.blogspot.com/'):
                        if link.startswith('/'):
                                link = 'http://' + url[1] + link
                        elif link.startswith('#'):
                                link = 'http://' + url[1] + url[2] + link
                        elif not link.startswith('http'):
                                link = 'http://' + url[1] + '/' + link
                        # Only queue and store URLs we have not seen before.
                        cur.execute('SELECT 1 FROM blogList WHERE url=?', (link,))
                        if cur.fetchone() is None:
                                tocrawl.add(link)
                                cur.execute('INSERT INTO blogList (url) VALUES (?)', (link,))
                                conn.commit()
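
One further optimisation worth sketching (it is only a sketch, and it assumes blogList has, or can be given, a UNIQUE constraint on its url column, which the schema above is not shown to have): SQLite's INSERT OR IGNORE does the duplicate check and the insert in a single statement, so the extra SELECT per link goes away. The remember() helper below is a hypothetical name, not part of the crawler, and it reuses the conn object opened above:

def remember(conn, url):
        # Insert url if it is new and report whether it was new.
        # Assumes a UNIQUE constraint on blogList.url, e.g.
        #   CREATE TABLE blogList (key INTEGER PRIMARY KEY, url TEXT UNIQUE)
        cur = conn.cursor()
        cur.execute('INSERT OR IGNORE INTO blogList (url) VALUES (?)', (url,))
        conn.commit()
        # rowcount is 1 when the row went in, 0 when the url was already stored.
        return cur.rowcount > 0

Inside the crawl loop, if remember(conn, link): tocrawl.add(link) would then replace the SELECT-then-INSERT pair. Commits could also be batched (say, every 50 new links) instead of once per link, which cuts down on disk syncs.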