# HTML Outliner 0.5
#
# See the tests at the bottom for usage examples
#
# Copyright Christoffer Sawicki 2006-2008
# Licensed under the same terms as Ruby
#
# Please send bug reports and improvements to
# christoffer.sawicki@gmail.com
#
# Canonical source
# http://code.vemod.net/svn/misc/hpricot_goodies/html_outliner.rb
#
# Noteworthy changes
# * 0.5: Should now handle malformed HTML gracefully.
# * 0.4: An element can only have one id,
#        so pass the current id to the slugifier.

require "hpricot"

# Minimal tree node: a root value plus an ordered list of child trees.
class SimpleTree
  attr_accessor :root, :subtrees

  # root     - the value stored at this node
  # subtrees - child SimpleTree instances (a fresh array per call by default)
  def initialize(root, subtrees = [])
    @root = root
    @subtrees = subtrees
  end
end

# Collects the h1-h6 elements of an Hpricot document, assigns ids to them
# and arranges them into a nested outline.
class HTMLOutliner
  # Matches a heading element name in any letter case and captures its level
  # digit. Anchored with \A/\z so the whole name has to match.
  HEADER_NAME = /\Ah([1-6])\z/i

  # hpricot_doc - the parsed document whose children are searched for headings
  def initialize(hpricot_doc)
    @doc = hpricot_doc
  end

  # Sets the "id" attribute of every heading, in document order.
  #
  # slugifier - any object responding to call(inner_html, index, current_id)
  #             and returning the new id; defaults to default_slugifier.
  #
  # Returns the array of headings that were processed.
  def add_header_anchors!(slugifier = method(:default_slugifier))
    headers.each_with_index do |header, index|
      header["id"] = slugifier.call(header.inner_html, index, header["id"])
    end
  end

  # Builds an id from the heading text: lower-cased, each whitespace run
  # replaced by "_", prefixed with "section-". index and current_id are
  # accepted for interface compatibility but not used.
  def default_slugifier(string, index, current_id)
    # Just overwrite the current id
    "section-" + string.downcase.gsub(/\s+/, "_")
  end

  # Returns every heading element of the document, in document order.
  def headers
    find_headers(@doc.children)
  end

  # Returns an array of SimpleTree objects, one per top-level heading; each
  # tree's subtrees hold the deeper headings that follow it.
  def outline
    tree_stack = [] # chain of currently "open" headings, outermost first
    top_level = []

    headers.each do |header|
      new_tree = SimpleTree.new(header)
      level = header_level(header)

      # Close every open heading that is not strictly shallower than this one.
      tree_stack.pop until tree_stack.empty? || header_level(tree_stack.last.root) < level

      if tree_stack.empty?
        top_level.push(new_tree)
      else
        tree_stack.last.subtrees.push(new_tree)
      end
      tree_stack.push(new_tree)
    end

    top_level
  end

  private

  # Numeric level (1-6) of a heading element. Comparing integers instead of
  # the raw tag names keeps nesting correct when names differ only in case
  # (e.g. "H1" vs "h2"), which a plain string comparison gets wrong.
  def header_level(node)
    node.name[HEADER_NAME, 1].to_i
  end

  # Depth-first search for heading elements below the given nodes.
  def find_headers(nodes)
    found = []

    # NOTE(review): children is presumably nil for empty containers in some
    # Hpricot releases - confirm. The guard is harmless otherwise.
    (nodes || []).each do |node|
      # Text nodes and stray closing tags are skipped without descending.
      next if node.is_a?(Hpricot::Text) || node.is_a?(Hpricot::BogusETag)

      found.push(node) if node.name =~ HEADER_NAME
      found.concat(find_headers(node.children))
    end

    found
  end
end

if __FILE__ == $0
  require "test/unit"

  # NOTE(review): the HTML markup inside the fixtures below had been stripped
  # from this copy of the file (only the text content was left). It has been
  # reconstructed from what the assertions require; confirm against the
  # canonical source.
  class HTMLOutlinerTest < Test::Unit::TestCase
    def test_construction_with_root
      assert_nothing_raised do
        HTMLOutliner.new(Hpricot("<html></html>")).headers
      end
    end

    def test_construction_without_root
      assert_nothing_raised do
        HTMLOutliner.new(Hpricot("")).headers
      end
    end

    def test_headers
      input = <<-END
        <h1>Hello</h1>
        <h2>World</h2>
      END

      doc = Hpricot(input)
      outliner = HTMLOutliner.new(doc)

      assert_equal 2, outliner.headers.size
      assert_equal "Hello", outliner.headers[0].inner_html
      assert_equal "World", outliner.headers[1].inner_html
    end

    def test_add_anchors
      input = <<-END
        <h1>Hello</h1>
        <h2>World</h2>
      END

      expected_output = <<-END
        <h1 id="section-hello">Hello</h1>
        <h2 id="section-world">World</h2>
      END

      doc = Hpricot(input)
      outliner = HTMLOutliner.new(doc)
      outliner.add_header_anchors!

      assert_equal(expected_output, doc.to_s)
    end

    def test_add_anchors_with_existing_id
      input = <<-END
        <h1 id="foo">Hello</h1>
      END

      expected_output = <<-END
        <h1 id="section-hello">Hello</h1>
      END

      doc = Hpricot(input)
      outliner = HTMLOutliner.new(doc)
      outliner.add_header_anchors!

      assert_equal(expected_output, doc.to_s)
    end

    def test_add_anchors_with_custom_slugifier
      input = <<-END
        <h1>Hello</h1>
      END

      expected_output = <<-END
        <h1 id="chapter1">Hello</h1>
      END

      doc = Hpricot(input)
      outliner = HTMLOutliner.new(doc)
      slugifier = lambda { |x, i, current_id| "chapter#{i + 1}" }
      outliner.add_header_anchors!(slugifier)

      assert_equal(expected_output, doc.to_s)
    end

    def test_outline
      doc = Hpricot(<<-END)
        <h1>A</h1>
        <h1>B</h1>
        <h2>C</h2>
        <h3>D</h3>
        <h2>E</h2>
      END

      outline = HTMLOutliner.new(doc).outline

      assert_equal 2, outline.size
      assert_equal 0, outline[0].subtrees.size
      assert_equal 2, outline[1].subtrees.size
      assert_equal 1, outline[1].subtrees[0].subtrees.size
      assert_equal 0, outline[1].subtrees[0].subtrees[0].subtrees.size
      assert_equal 0, outline[1].subtrees[1].subtrees.size

      assert_equal "A", outline[0].root.inner_html
      assert_equal "B", outline[1].root.inner_html
      assert_equal "C", outline[1].subtrees[0].root.inner_html
      assert_equal "D", outline[1].subtrees[0].subtrees[0].root.inner_html
      assert_equal "E", outline[1].subtrees[1].root.inner_html
    end

    def test_malformed_html
      # Mismatched closing tag on the first heading.
      doc = Hpricot(<<-END)
        <h1>Uh!</h2>
        <h2>B</h2>
        <h3>C</h3>
      END

      assert_nothing_raised do
        HTMLOutliner.new(doc).headers
      end

      assert_nothing_raised do
        HTMLOutliner.new(doc).outline
      end
    end
  end
end