Class: Crawline::Engine
- Inherits: Object
- Defined in: lib/crawline.rb
Instance Method Summary
- #convert_url_to_s3_path(url) ⇒ Object
- #crawl(url) ⇒ Object
- #data_to_json(data) ⇒ Object
- #download_or_redownload(url, parser, data) ⇒ Object
- #find_parser(url) ⇒ Object
- #get_latest_data_from_storage(url) ⇒ Object
- #initialize(downloader, repo, parsers, interval = 1.0) ⇒ Engine (constructor): A new instance of Engine.
- #json_to_data(json_data) ⇒ Object
- #parse(url) ⇒ Object
- #put_data_to_storage(url, data, related_links) ⇒ Object
Constructor Details
#initialize(downloader, repo, parsers, interval = 1.0) ⇒ Engine
Returns a new instance of Engine.
```ruby
# File 'lib/crawline.rb', line 205

def initialize(downloader, repo, parsers, interval = 1.0)
  @logger = CrawlineLogger.get_logger
  @logger.debug("Engine#initialize: start: downloader=#{downloader}, repo=#{repo}, parsers=#{parsers}")

  raise ArgumentError, "downloader is nil." if downloader.nil?
  raise ArgumentError, "repo is nil." if repo.nil?
  raise ArgumentError, "parsers is nil." if parsers.nil?
  raise TypeError, "downloader is not Crawline::Downloader." if not downloader.is_a?(Crawline::Downloader)
  raise TypeError, "repo is not Crawline::ResourceRepository." if not repo.is_a?(Crawline::ResourceRepository)

  parsers.each do |url_pattern, parser|
    raise TypeError, "parsers is not Hash<Regexp, Parser>." if not url_pattern.is_a?(Regexp)
    raise TypeError, "parsers is not Hash<Regexp, Parser>." if not (parser < Crawline::BaseParser)
  end

  @downloader = downloader
  @repo = repo
  @parsers = parsers
  @interval = interval
end
```
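A minimal construction sketch, for orientation. Only the `Engine#initialize` signature and the type checks above come from this page; the parser class and the `Downloader`/`ResourceRepository` constructor arguments are invented for illustration.

```ruby
require "crawline"

# Hypothetical parser; Engine requires each hash value to be a
# subclass of Crawline::BaseParser.
class BlogParser < Crawline::BaseParser
  # ...
end

# Assumed constructor arguments; check the Downloader and
# ResourceRepository docs for the real signatures.
downloader = Crawline::Downloader.new("my-crawler/1.0 (+https://example.com/bot)")
repo       = Crawline::ResourceRepository.new

engine = Crawline::Engine.new(
  downloader,
  repo,
  { %r{\Ahttps://blog\.example\.com/} => BlogParser }, # Regexp => parser class
  2.0                                                  # seconds to sleep between downloads
)
```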
Instance Method Details
#convert_url_to_s3_path(url) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 398

def convert_url_to_s3_path(url)
  path = OpenSSL::Digest::SHA256.hexdigest(url)
  path = path[0..1] + "/" + path
end
```
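The first two hex characters of the SHA-256 digest become a directory-style prefix, fanning cached objects out across 256 key prefixes. A worked sketch (the digest value in the comment is illustrative, not computed):

```ruby
require "openssl"

url    = "https://blog.example.com/articles/42"
digest = OpenSSL::Digest::SHA256.hexdigest(url) # 64 lowercase hex characters
path   = digest[0..1] + "/" + digest            # e.g. "ab/ab12..." (prefix + full digest)
```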
#crawl(url) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 226

def crawl(url)
  @logger.debug("Engine#crawl: start: url=#{url}")

  url_list = [url]
  result = { "success_url_list" => [], "fail_url_list" => [], "context" => {} }

  until url_list.empty? do
    target_url = url_list.shift
    @logger.debug("Engine#crawl: target_url=#{target_url}")

    begin
      next_links = crawl_impl(target_url, result["context"])

      if not next_links.nil?
        next_links.each do |next_link|
          url_list << next_link if (not url_list.include?(next_link)) &&
            (not result["success_url_list"].include?(next_link)) &&
            (not result["fail_url_list"].include?(next_link))
        end
      end

      result["success_url_list"].push(target_url)
    rescue => err
      @logger.warn("Engine#crawl: crawl error")
      @logger.warn(err)

      result["fail_url_list"].push(target_url)
    end

    @logger.info("Engine#crawl: progress: total=#{url_list.size + result["success_url_list"].size + result["fail_url_list"].size}, success=#{result["success_url_list"].size}, fail=#{result["fail_url_list"].size}, remaining=#{url_list.size}")
  end

  result
end
```
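#crawl is a breadth-first walk: each fetched URL can enqueue new links (via crawl_impl), and every visited URL ends up in either the success or the fail list. A usage sketch, assuming the `engine` from the construction example above:

```ruby
result = engine.crawl("https://blog.example.com/")

result["success_url_list"].each { |u| puts "ok   #{u}" }
result["fail_url_list"].each   { |u| puts "fail #{u}" }
```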
#data_to_json(data) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 308

def data_to_json(data)
  json_data = Marshal.load(Marshal.dump(data))
  json_data["response_body"] = Base64.urlsafe_encode64(json_data["response_body"])
  json_data["downloaded_timestamp"] = json_data["downloaded_timestamp"].to_i

  json_data.to_json
end
```
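Since `response_body` may hold raw bytes, it is Base64-encoded with the URL-safe alphabet, and the timestamp is flattened to epoch seconds before the hash is serialized. A sketch (the hash shape is inferred from the method bodies on this page):

```ruby
data = {
  "response_body"        => "<html>...</html>",
  "downloaded_timestamp" => Time.now.getutc
}

json = engine.data_to_json(data)
# JSON.parse(json)["response_body"]        => URL-safe Base64 string
# JSON.parse(json)["downloaded_timestamp"] => Integer epoch seconds
```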
#download_or_redownload(url, parser, data) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 338

def download_or_redownload(url, parser, data)
  @logger.debug("Engine#download_or_redownload: start: url=#{url}, parser=#{parser}, data.nil?=#{data.nil?}")

  if data.nil?
    @logger.debug("Engine#download_or_redownload: download")
    sleep(@interval)

    new_data = @downloader.download_with_get(url)
  else
    parser_instance = parser.new(url, data)

    if parser_instance.redownload?
      @logger.debug("Engine#download_or_redownload: redownload")
      sleep(@interval)

      new_data = @downloader.download_with_get(url)
    else
      @logger.debug("Engine#download_or_redownload: skip")

      nil
    end
  end
end
```
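The parser owns the staleness decision: when cached data exists, the engine instantiates the parser and re-fetches only if `redownload?` returns true. A sketch of that side of the contract, with an invented 24-hour staleness rule:

```ruby
class ArticleParser < Crawline::BaseParser
  def initialize(url, data)
    @url  = url
    @data = data
  end

  # Called by Engine#download_or_redownload; the 24-hour threshold is illustrative.
  def redownload?
    Time.now.utc - @data["downloaded_timestamp"] > 24 * 60 * 60
  end
end
```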
#find_parser(url) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 292

def find_parser(url)
  @logger.debug("Engine#find_parser: start: url=#{url}")

  parser = @parsers.find do |url_pattern, clazz|
    url_pattern.match(url)
  end
  @logger.debug("Engine#find_parser: parser=#{parser}")

  if parser.nil?
    @logger.debug("Engine#find_parser: parser not found")
    raise ParserNotFoundError.new(url)
  end

  parser[1]
end
```
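Ruby hashes preserve insertion order and `Hash#find` yields pairs in that order, so the first matching pattern wins; registering a broad pattern last gives a fallback. For example, with the hypothetical parser classes from the earlier sketches:

```ruby
parsers = {
  %r{/articles/\d+\z} => ArticleParser, # specific pattern first
  /.*/                => BlogParser     # matches anything: a last-resort fallback
}
# find_parser("https://blog.example.com/articles/42") returns ArticleParser;
# a URL that matches no pattern raises ParserNotFoundError.
```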
#get_latest_data_from_storage(url) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 325

def get_latest_data_from_storage(url)
  @logger.debug("Engine#get_latest_data_from_storage: start: url=#{url}")

  s3_path = convert_url_to_s3_path(url)
  data = @repo.get_s3_object(s3_path + ".json")

  if not data.nil?
    json_to_data(data)
  else
    nil
  end
end
```
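A lookup sketch; the result is the deserialized data hash, or nil on a cache miss:

```ruby
data = engine.get_latest_data_from_storage("https://blog.example.com/articles/42")

if data
  puts "cached at #{data["downloaded_timestamp"]}" # UTC Time of the stored fetch
else
  puts "not in storage yet"
end
```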
#json_to_data(json_data) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 316

def json_to_data(json_data)
  data = JSON.parse(json_data)
  data["response_body"] = Base64.urlsafe_decode64(data["response_body"])
  data["response_body"].force_encoding("US-ASCII")
  data["downloaded_timestamp"] = Time.at(data["downloaded_timestamp"], 0).getutc

  data
end
```
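This is the inverse of #data_to_json, with two caveats visible in the code: the restored body is tagged as US-ASCII regardless of its original encoding, and the timestamp comes back truncated to whole seconds. Continuing the serialization sketch above:

```ruby
restored = engine.json_to_data(json) # `json` from the #data_to_json sketch

restored["response_body"].encoding    # => #<Encoding:US-ASCII>
restored["downloaded_timestamp"].utc? # => true (whole-second resolution)
```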
#parse(url) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 259

def parse(url)
  @logger.debug("Engine#parse: start: url=#{url}")

  url_list = [url]
  result = { "success_url_list" => [], "fail_url_list" => [], "context" => {} }

  until url_list.empty? do
    target_url = url_list.shift
    @logger.debug("Engine#parse: target_url=#{target_url}")

    begin
      next_links = parse_impl(target_url, result["context"])

      if not next_links.nil?
        next_links.each do |next_link|
          url_list << next_link if (not url_list.include?(next_link)) &&
            (not result["success_url_list"].include?(next_link)) &&
            (not result["fail_url_list"].include?(next_link))
        end
      end

      result["success_url_list"].push(target_url)
    rescue => err
      @logger.warn("Engine#parse: parse error")
      @logger.warn(err)

      result["fail_url_list"].push(target_url)
    end

    @logger.info("Engine#parse: progress: total=#{url_list.size + result["success_url_list"].size + result["fail_url_list"].size}, success=#{result["success_url_list"].size}, fail=#{result["fail_url_list"].size}, remaining=#{url_list.size}")
  end

  result["context"]
end
```
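The loop mirrors #crawl, but the return value differs: #parse returns only the accumulated `context` hash, not the URL lists. A usage sketch:

```ruby
# Walk the pages and collect whatever the parsers store in context.
context = engine.parse("https://blog.example.com/")
# The shape of `context` depends entirely on the parser implementations.
```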
#put_data_to_storage(url, data, related_links) ⇒ Object
```ruby
# File 'lib/crawline.rb', line 362

def put_data_to_storage(url, data, related_links)
  @logger.debug("Engine#put_data_to_storage: start: url=#{url}, data=#{data.size if not data.nil?}")

  s3_path = convert_url_to_s3_path(url)
  @repo.put_s3_object(s3_path + ".json", data_to_json(data))

  # save database
  cache_data = Model::CrawlineCache.new(url: data["url"], request_method: data["request_method"], downloaded_timestamp: data["downloaded_timestamp"], storage_path: s3_path)

  headers_data = data["request_headers"].map do |k, v|
    Model::CrawlineHeader.new(crawline_cache: cache_data, message_type: "request", header_name: k, header_value: v)
  end

  headers_data += data["response_headers"].map do |k, v|
    Model::CrawlineHeader.new(crawline_cache: cache_data, message_type: "response", header_name: k, header_value: v)
  end

  if not related_links.nil?
    urls = related_links
  else
    urls = []
  end

  related_links_data = urls.map do |url|
    Model::CrawlineRelatedLink.new(crawline_cache: cache_data, url: url)
  end

  ActiveRecord::Base.transaction do
    cache_data.save!
    Model::CrawlineHeader.import(headers_data)
    Model::CrawlineRelatedLink.import(related_links_data)
  end
end
```
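The keys read by the method body imply the shape of the data hash; the values below are illustrative. The raw payload goes to S3, while the cache row, headers, and related links are written to the database in a single transaction:

```ruby
data = {
  "url"                  => "https://blog.example.com/articles/42",
  "request_method"       => "GET",
  "request_headers"      => { "User-Agent"   => "my-crawler/1.0" },
  "response_headers"     => { "Content-Type" => "text/html" },
  "response_body"        => "<html>...</html>",
  "downloaded_timestamp" => Time.now.getutc
}

engine.put_data_to_storage(data["url"], data, ["https://blog.example.com/articles/43"])
```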