攻略サイトをねこそぎ奪う
こんなん作った。かなり手抜きだけど
# A quick-and-dirty crawler: fetches a page and (one level deep) the
# same-domain pages it links to, plus its stylesheets and same-domain images.
# Usage: ruby crawler.rb http://example.com/
require 'net/http'
require 'socket'
require 'URL.rb' # URL helper class in the same directory

exit if ARGV.length != 1

# Create dir unless it already exists.
def mkdir(dir)
  Dir.mkdir(dir) unless File.exist?(dir)
end

# Fetch the HTML at url and split it into tokens: each tag ("<...>")
# becomes one token, as does each run of text between tags.
# Returns the token array, or nil when the fetch fails.
def parse_html(url)
  puts url.to_s
  tokens = []
  begin
    Net::HTTP.start(url.domain, 80) { |http|
      str = ""
      response = http.get(url.site)
      # Split on "<" / ">" (kept via the capture group) and reassemble tags.
      response.body.split(/([<>])/m).each { |t|
        case t
        when /</
          tokens.push(str) if str != ""
          str = t
        when />/
          tokens.push(str + t)
          str = ""
        else
          str += t
        end
      }
      tokens.push(str) if str != ""
      tokens
    }
  rescue
    puts "[" + url.to_s + "]取得失敗..."
    nil # caller treats nil as "nothing fetched"
  end
end

# Write tokens to url.filename, rewriting stylesheet/image references to
# the local ./css and ./img copies and same-domain links to local file
# names. Returns the list of same-domain URLs to crawl next.
def put_html(url, tokens)
  anchor = []
  # The fetch may have failed; there is nothing to write then.
  # (The original crashed with NoMethodError on nil.each here.)
  return anchor if tokens.nil?
  open(url.filename, "w") { |f|
    tokens.each { |token|
      case token
      when /<link/i
        elements = parse_tag(token)
        if elements["rel"] == "stylesheet"
          resource_url = url.make_url(elements["href"])
          load_resource(resource_url, "css")
          elements["href"] = "./css/" + resource_url.filename
          f.print "<link" + make_tag(elements) + ">"
        else
          f.print token
        end
      when /<img/i
        elements = parse_tag(token)
        resource_url = url.make_url(elements["src"])
        # Only download resources hosted on the same domain.
        if resource_url.domain == url.domain
          load_resource(resource_url, "img")
          elements["src"] = "./img/" + resource_url.filename
          f.print "<img" + make_tag(elements) + ">"
        else
          # Keep external images untouched. (The original dropped the
          # whole <img> tag from the output in this case.)
          f.print token
        end
      when /<a/i
        elements = parse_tag(token)
        # Only follow links from the start page (depth limit of 1).
        if url.level < 1 && elements.include?("href")
          anchor_url = url.make_url(elements["href"])
          if anchor_url.domain == url.domain
            anchor.push(anchor_url)
            elements["href"] = anchor_url.filename
            f.print "<a" + make_tag(elements) + ">"
          else
            f.print token
          end
        else
          f.print token
        end
      else
        f.print token
      end
    }
  }
  anchor
end

# Parse key="value" attribute pairs out of a tag string into a hash
# (keys lowercased, quotes stripped).
def parse_tag(str)
  elements = {}
  str.scan(/\w+\s*=['"].+?['"]/).each { |token|
    elements[token.split("=")[0].strip.downcase] =
      token.split("=")[1].gsub(/['"]/, "").strip
  }
  elements
end

# Render an attribute hash back into ` key="value"` form.
def make_tag(elements)
  str = ""
  elements.each { |key, value| str += " #{key}=\"#{value}\"" }
  str
end

# Download url into ./<type>/<filename> in binary mode
# (or the current directory when type is nil).
def load_resource(url, type)
  open((type.nil? ? "" : "./#{type}/") + url.filename, "wb") { |f|
    Net::HTTP.start(url.domain, 80) { |http|
      f.print http.get(url.site).body
    }
  }
end

Net::HTTP.version_1_2 # select the 1.2 API on Ruby 1.8

urls = [URL.new(ARGV[0])]
visited = {} # to_s => true; avoids fetching the same page repeatedly
mkdir("img")
mkdir("css")
# Array#each picks up elements pushed during iteration, so the urls array
# doubles as the crawl queue.
urls.each { |url|
  next if visited[url.to_s]
  visited[url.to_s] = true
  tokens = parse_html(url)
  urls.push(put_html(url, tokens))
  urls.flatten!
  urls.compact!
}
URLクラスはこんなんです
# Represents a URL split into path components, tracking crawl depth.
#
# Internally @str is an array of segments: the first element is the host,
# the rest are path components; query strings are stripped from the last
# segment. A bare host normalizes to ["host", ""] so that #site and
# #filename always have a last segment to work with.
class URL
  # Crawl depth at which this URL was discovered (0 = start page).
  attr_reader :level

  # url   - full ("http://host/path") or schemeless ("host/path") string
  # level - crawl depth of this URL (defaults to 0)
  def initialize(url, level = 0)
    @level = level
    # Strip the scheme when present. \A (not ^) so an "http://" at the
    # start of a later line cannot trigger a bogus strip of the head.
    url = url.sub(%r{\Ahttp://}, "")
    @str = url.split(/\//, -1)
    @str.push("") if @str.length == 1
    # Drop the query string. (The original split(/\?/)[0] yielded nil
    # when the last segment was exactly "?", breaking #filename.)
    @str[-1] = @str[-1].sub(/\?.*\z/m, "")
  end

  # Host part of the URL.
  def domain
    @str[0]
  end

  # Absolute path part, always starting with "/".
  def site
    "/" + @str[1..-1].join("/")
  end

  # File name to save this page under; directory URLs map to "index.html".
  def filename
    @str[-1] == "" ? "index.html" : @str[-1]
  end

  # Resolve url (full, host-relative, or document-relative) against this
  # URL, one crawl level deeper.
  def make_url(url)
    case url
    when %r{\Ahttp://}
      # Full URL.
      URL.new(url, level + 1)
    when %r{\A/}
      # Host-relative path.
      URL.new(domain + url, level + 1)
    else
      # Document-relative path: resolve against this URL's directory.
      URL.new(@str[0...-1].join("/") + "/" + url, level + 1)
    end
  end

  # Schemeless string form: "host/path/...".
  def to_s
    @str.join("/")
  end
end