author     joey <joey@0fa5a96a-9a0e-0410-b3b2-a0fd24251071>    2006-10-28 03:27:10 +0000
committer  joey <joey@0fa5a96a-9a0e-0410-b3b2-a0fd24251071>    2006-10-28 03:27:10 +0000
commit     49bf877701d89d613dcf5c2d85bd08876a636dba (patch)
tree       d28ec4df6277b2dcf8dbcd7aac5dc63215d9618a
parent     05fe79b4872547997b4e54cf6965743b7fbf6e57 (diff)
* Add a separate pass to find page links, and only render each page once,
instead of over and over. This is up to 8 times faster than before!
(This could have introduced some subtle bugs, so it needs to be tested
extensively.)
-rw-r--r--  IkiWiki/Render.pm            | 64
-rw-r--r--  debian/changelog             |  9
-rw-r--r--  doc/roadmap.mdwn             |  5
-rw-r--r--  doc/todo/optimisations.mdwn  | 36
4 files changed, 64 insertions, 50 deletions
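In outline, the patch below restructures refresh() into two passes: a scan
pass that re-extracts the links from each new or changed source file, and a
render pass that then builds every changed page exactly once against the
fully updated %links hash. A simplified sketch of that control flow, reusing
the patch's own scan() and render() routines (not the verbatim code; the
real loops below also handle ctime lookup, forced rebuilds, and deletions):

    # Sketch of the two-pass refresh this patch introduces.
    foreach my $file (@files) {
        my $page=pagename($file);
        # Pass 1: update %links for changed sources, without rendering.
        scan($file) if mtime(srcfile($file)) > $oldpagemtime{$page};
    }
    foreach my $file (@files) {
        my $page=pagename($file);
        next if mtime(srcfile($file)) <= $oldpagemtime{$page};
        # Pass 2: render each changed page exactly once; bestlink() and
        # backlinks() now see up-to-date link data for every page.
        render($file);
        $rendered{$file}=1;
    }

Separating the cheap link scan from the expensive HTML generation is what
removes the repeated renders: by the time render() runs, no page's links can
change under it.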
diff --git a/IkiWiki/Render.pm b/IkiWiki/Render.pm
index deec539ae..026b3582e 100644
--- a/IkiWiki/Render.pm
+++ b/IkiWiki/Render.pm
@@ -13,6 +13,7 @@ sub backlinks ($) { #{{{
     my @links;
     foreach my $p (keys %links) {
         next if bestlink($page, $p) eq $page;
+
         if (grep { length $_ && bestlink($p, $_) eq $page } @{$links{$p}}) {
             my $href=abs2rel(htmlpage($p), dirname($page));
@@ -119,21 +120,25 @@ sub mtime ($) { #{{{
     return (stat($file))[9];
 } #}}}
 
-sub findlinks ($$) { #{{{
-    my $page=shift;
-    my $content=shift;
+sub scan ($) { #{{{
+    my $file=shift;
 
-    my @links;
-    while ($content =~ /(?<!\\)$config{wiki_link_regexp}/g) {
-        push @links, titlepage($2);
-    }
-    if ($config{discussion}) {
-        # Discussion links are a special case since they're not in the
-        # text of the page, but on its template.
-        return @links, "$page/discussion";
-    }
-    else {
-        return @links;
+    my $type=pagetype($file);
+    if (defined $type) {
+        my $srcfile=srcfile($file);
+        my $content=readfile($srcfile);
+        my $page=pagename($file);
+
+        my @links;
+        while ($content =~ /(?<!\\)$config{wiki_link_regexp}/g) {
+            push @links, titlepage($2);
+        }
+        if ($config{discussion}) {
+            # Discussion links are a special case since they're not in the
+            # text of the page, but on its template.
+            push @links, "$page/discussion";
+        }
+        $links{$page}=\@links;
     }
 } #}}}
@@ -149,9 +154,6 @@ sub render ($) { #{{{
         will_render($page, htmlpage($page), 1);
 
         $content=filter($page, $content);
-
-        $links{$page}=[findlinks($page, $content)];
-
         $content=preprocess($page, $page, $content);
         $content=linkify($page, $page, $content);
         $content=htmlize($page, $type, $content);
@@ -162,7 +164,6 @@
     }
     else {
         my $content=readfile($srcfile, 1);
-        $links{$file}=[];
         delete $depends{$file};
         will_render($file, $file, 1);
         writefile($file, $config{destdir}, $content, 1);
@@ -238,9 +239,8 @@ sub refresh () { #{{{
     foreach my $file (@files) {
         my $page=pagename($file);
         if (! $oldpagemtime{$page}) {
-            debug("new page $page") unless exists $pagectime{$page};
             push @add, $file;
-            $links{$page}=[];
+            scan($file);
             $pagecase{lc $page}=$page;
             $pagesources{$page}=$file;
             if ($config{getctime} && -e "$config{srcdir}/$file") {
@@ -256,6 +256,7 @@
         if (! $exists{$page}) {
             debug("removing old page $page");
             push @del, $pagesources{$page};
+            $links{$page}=[];
             $renderedfiles{$page}=[];
             $oldpagemtime{$page}=0;
             prune($config{destdir}."/".$_)
@@ -264,6 +265,18 @@
         }
     }
 
+    # scan updated files to update info about them
+    foreach my $file (@files) {
+        my $page=pagename($file);
+
+        if (! exists $oldpagemtime{$page} ||
+            mtime(srcfile($file)) > $oldpagemtime{$page} ||
+            $forcerebuild{$page}) {
+            debug("scanning $file");
+            scan($file);
+        }
+    }
+
     # render any updated files
     foreach my $file (@files) {
         my $page=pagename($file);
@@ -278,12 +291,10 @@
     }
 
     # if any files were added or removed, check to see if each page
-    # needs an update due to linking to them or inlining them.
-    # TODO: inefficient; pages may get rendered above and again here;
-    # problem is the bestlink may have changed and we won't know until
-    # now
+    # needs an update due to linking to them or inlining them
     if (@add || @del) {
FILE:   foreach my $file (@files) {
+            next if $rendered{$file};
            my $page=pagename($file);
            foreach my $f (@add, @del) {
                my $p=pagename($f);
@@ -301,11 +312,9 @@ FILE:   foreach my $file (@files) {
 
     # Handle backlinks; if a page has added/removed links, update the
     # pages it links to. Also handles rebuilding dependant pages.
-    # TODO: inefficient; pages may get rendered above and again here;
-    # problem is the backlinks could be wrong in the first pass render
-    # above
     if (%rendered || @del) {
         foreach my $f (@files) {
+            next if $rendered{$f};
             my $p=pagename($f);
             if (exists $depends{$p}) {
                 foreach my $file (keys %rendered, @del) {
@@ -347,6 +356,7 @@ FILE:   foreach my $file (@files) {
     foreach my $link (keys %linkchanged) {
         my $linkfile=$pagesources{$link};
         if (defined $linkfile) {
+            next if $rendered{$linkfile};
             debug("rendering $linkfile, to update its backlinks");
             render($linkfile);
             $rendered{$linkfile}=1;
diff --git a/debian/changelog b/debian/changelog
index 24f0e3886..e914d40b3 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,12 @@
+ikiwiki (1.32) UNRELEASED; urgency=low
+
+  * Add a separate pass to find page links, and only render each page once,
+    instead of over and over. This is up to 8 times faster than before!
+    (This could have introduced some subtle bugs, so it needs to be tested
+    extensively.)
+
+ -- Joey Hess <joeyh@debian.org>  Fri, 27 Oct 2006 23:21:35 -0400
+
 ikiwiki (1.31) unstable; urgency=low
 
   * Patch from Pawel Tecza to cp -a the templates in the Makefile.
diff --git a/doc/roadmap.mdwn b/doc/roadmap.mdwn
index 7a3193308..b393f254f 100644
--- a/doc/roadmap.mdwn
+++ b/doc/roadmap.mdwn
@@ -14,12 +14,11 @@ Released 29 April 2006.
 * Unit test suite (with tests of at least core stuff like [[PageSpec]]).
   _(status: exists, could of course use more tests)_
-* [[Plugins]] _(status: done, interface still not [[quite_stable|todo/firm_up_plugin_interface]])_
+* [[Plugins]] _(status: done)_
 * [[Tags]] _(status: fair)_
 * Should have fully working [[todo/utf8]] support. _(status: good)_
 * [[Optimised_rendering|todo/optimisations]] if possible. Deal with other
-  scalability issues. _(status: 45%-60%+ speedup since 1.0, much more
-  possible)_
+  scalability issues. _(status: something like 9x speedup 1.0!)_
 * Improved [[todo/html]] stylesheets and templates.
 * Improved scalable [[logo]]. _(status: done)_
 * Support for at other revision control systems aside from svn.
diff --git a/doc/todo/optimisations.mdwn b/doc/todo/optimisations.mdwn
index 4e8118756..13a270b8f 100644
--- a/doc/todo/optimisations.mdwn
+++ b/doc/todo/optimisations.mdwn
@@ -1,25 +1,21 @@
-* Render each changed page only once. Currently pages are rendered up to 4
-  times in worst case (8 times if there's an rss feed).
-
-  The issue is that rendering a page is used to gather info like the links
-  on the page (and other stuff) that can effect rendering other pages. So it
-  needs a multi-pass system. But rendering the whole page in each pass is
-  rather obscene.
-
-  It would be better to have the first pass be a data gathering pass. Such
-  a pass would still need to load and parse the page contents etc, but
-  wouldn't need to generate html or write anything to disk.
-
-  One problem with this idea is that it could turn into 2x the work in
-  cases where ikiwiki currently efficiently renders a page just once. And
-  caching between the passes to avoid that wouldn't do good things to the
-  memory footprint.
-
-  Might be best to just do a partial first pass, getting eg, the page links
-  up-to-date, and then multiple, but generally fewer, rendering passes.
-
 * Don't render blog archive pages unless a page is added/removed. Just
   changing a page doesn't affect the archives as they show only the title.
 
 * Look at splitting up CGI.pm. But note that too much splitting can slow
   perl down.
+
+* The backlinks code turns out to scale badly to wikis with thousands of
+  pages. The code is O(N^2)! It's called for each page, and it loops
+  through all the pages to find backlinks.
+
+  Need to find a way to calculate and cache all the backlinks in one pass,
+  which could be done in at worst O(N), and possibly less (if they're
+  stored in the index, it could be constant time). But to do this, there
+  would need to be a way to invalidate or update the cache in these
+  situations:
+
+  - A page is added. Note that this can change a backlink to point to
+    the new page instead of the page it pointed to before.
+  - A page is deleted. This can also change backlinks that pointed to that
+    page.
+  - A page is modified. Links added/removed.
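One way to get the single pass that last todo item asks for is to invert
%links into a hash the first time backlinks are needed. The sketch below is
hypothetical: neither %backlinks nor calculate_backlinks() exists in this
patch, and the invalidation hooks are left as a comment:

    # Hypothetical one-pass backlinks cache, per the todo item above.
    # Walks %links once (O(total links)) rather than once per page.
    my %backlinks;
    my $backlinks_calculated=0;

    sub calculate_backlinks () {
        return if $backlinks_calculated;
        foreach my $page (keys %links) {
            foreach my $link (@{$links{$page}}) {
                my $bestlink=bestlink($page, $link);
                if (length $bestlink && $bestlink ne $page) {
                    $backlinks{$bestlink}{$page}=1;
                }
            }
        }
        # When a page is added, deleted, or has its links changed, the
        # cache must be rebuilt: reset $backlinks_calculated to 0.
        $backlinks_calculated=1;
    }

Looking up a page's backlinks then becomes a single hash fetch, and the
three invalidation cases above reduce to clearing the cache flag.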