Class: Crawline::Engine

Inherits:
Object
Defined in:
lib/crawline.rb

Instance Method Summary

  • #convert_url_to_s3_path(url) ⇒ Object
  • #crawl(url) ⇒ Object
  • #data_to_json(data) ⇒ Object
  • #download_or_redownload(url, parser, data) ⇒ Object
  • #find_parser(url) ⇒ Object
  • #get_latest_data_from_storage(url) ⇒ Object
  • #initialize(downloader, repo, parsers, interval = 1.0) ⇒ Engine constructor
  • #json_to_data(json_data) ⇒ Object
  • #parse(url) ⇒ Object
  • #put_data_to_storage(url, data, related_links) ⇒ Object

Constructor Details

#initialize(downloader, repo, parsers, interval = 1.0) ⇒ Engine

Returns a new instance of Engine.

Raises:

  • (ArgumentError)
  • (TypeError)


# File 'lib/crawline.rb', line 205

def initialize(downloader, repo, parsers, interval = 1.0)
  @logger = CrawlineLogger.get_logger
  @logger.debug("Engine#initialize: start: downloader=#{downloader}, repo=#{repo}, parsers=#{parsers}")

  raise ArgumentError, "downloader is nil." if downloader.nil?
  raise ArgumentError, "repo is nil." if repo.nil?
  raise ArgumentError, "parsers is nil." if parsers.nil?

  raise TypeError, "downloader is not Crawline::Downloader." if not downloader.is_a?(Crawline::Downloader)
  raise TypeError, "repo is not Crawline::ResourceRepository." if not repo.is_a?(Crawline::ResourceRepository)
  parsers.each do |url_pattern, parser|
    raise TypeError, "parsers is not Hash<Regexp, Parser>." if not url_pattern.is_a?(Regexp)
    raise TypeError, "parsers is not Hash<Regexp, Parser>." if not (parser < Crawline::BaseParser)
  end

  @downloader = downloader
  @repo = repo
  @parsers = parsers
  @interval = interval
end
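
For orientation, a minimal construction sketch. The collaborator classes Crawline::Downloader, Crawline::ResourceRepository and Crawline::BaseParser are the types checked by the constructor above; build_downloader, build_repository, MyListParser and MyItemParser are hypothetical placeholders, since their own definitions are not documented on this page.

require "crawline"

# downloader and repo are assumed to be already-constructed collaborators;
# their constructor arguments are not shown on this page.
downloader = build_downloader   # hypothetical helper returning a Crawline::Downloader
repo       = build_repository   # hypothetical helper returning a Crawline::ResourceRepository

parsers = {
  %r{^https://example\.com/list/} => MyListParser,   # MyListParser < Crawline::BaseParser (assumed)
  %r{^https://example\.com/item/} => MyItemParser    # MyItemParser < Crawline::BaseParser (assumed)
}

engine = Crawline::Engine.new(downloader, repo, parsers, 2.0)   # 2-second politeness interval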

Instance Method Details

#convert_url_to_s3_path(url) ⇒ Object



# File 'lib/crawline.rb', line 398

def convert_url_to_s3_path(url)
  path = OpenSSL::Digest::SHA256.hexdigest(url)
  path = path[0..1] + "/" + path
end
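
The method shards storage paths by the first two hex characters of the URL's SHA-256 digest. A standalone sketch of the same computation:

require "openssl"

digest  = OpenSSL::Digest::SHA256.hexdigest("https://example.com/page/1")
s3_path = digest[0..1] + "/" + digest
# s3_path has the form "xy/xy...", where "xy" is the first two hex characters
# of the 64-character hex digest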

#crawl(url) ⇒ Object



# File 'lib/crawline.rb', line 226

def crawl(url)
  @logger.debug("Engine#crawl: start: url=#{url}")

  url_list = [url]
  result = { "success_url_list" => [], "fail_url_list" => [], "context" => {} }

  until url_list.empty? do
    target_url = url_list.shift
    @logger.debug("Engine#crawl: target_url=#{target_url}")

    begin
      next_links = crawl_impl(target_url, result["context"])

      if not next_links.nil?
        next_links.each do |next_link|
          url_list << next_link if (not url_list.include?(next_link)) && (not result["success_url_list"].include?(next_link)) && (not result["fail_url_list"].include?(next_link))
        end
      end

      result["success_url_list"].push(target_url)
    rescue => err
      @logger.warn("Engine#crawl: crawl error")
      @logger.warn(err)

      result["fail_url_list"].push(target_url)
    end

    @logger.info("Engine#crawl: progress: total=#{url_list.size + result["success_url_list"].size + result["fail_url_list"].size}, success=#{result["success_url_list"].size}, fail=#{result["fail_url_list"].size}, remaining=#{url_list.size}")
  end

  result
end
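
A usage sketch, assuming an engine constructed as in the constructor example above. The keys of the returned hash are those initialized at the top of the method:

result = engine.crawl("https://example.com/list/1")

result["success_url_list"]   # URLs processed without error
result["fail_url_list"]      # URLs whose crawl step raised an exception
result["context"]            # shared context hash passed to each crawl step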

#data_to_json(data) ⇒ Object



# File 'lib/crawline.rb', line 308

def data_to_json(data)
  json_data = Marshal.load(Marshal.dump(data))
  json_data["response_body"] = Base64.urlsafe_encode64(json_data["response_body"])
  json_data["downloaded_timestamp"] = json_data["downloaded_timestamp"].to_i

  json_data.to_json
end
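
A serialization sketch. The key set shown here is inferred from the other methods on this page (json_to_data and put_data_to_storage); only "response_body" and "downloaded_timestamp" are transformed:

data = {
  "url"                  => "https://example.com/page/1",
  "request_method"       => "GET",
  "request_headers"      => { "user-agent" => "my-crawler/1.0" },
  "response_headers"     => { "content-type" => "text/html" },
  "response_body"        => "<html>...</html>",
  "downloaded_timestamp" => Time.now.utc
}

json = engine.data_to_json(data)
# "response_body" is Base64 (URL-safe) encoded and "downloaded_timestamp"
# becomes an integer Unix timestamp in the resulting JSON string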

#download_or_redownload(url, parser, data) ⇒ Object



# File 'lib/crawline.rb', line 338

def download_or_redownload(url, parser, data)
  @logger.debug("Engine#download_or_redownload: start: url=#{url}, parser=#{parser}, data.nil?=#{data.nil?}")

  if data.nil?
    @logger.debug("Engine#download_or_redownload: download")

    sleep(@interval)
    new_data = @downloader.download_with_get(url)
  else
    parser_instance = parser.new(url, data)

    if parser_instance.redownload?
      @logger.debug("Engine#download_or_redownload: redownload")

      sleep(@interval)
      new_data = @downloader.download_with_get(url)
    else
      @logger.debug("Engine#download_or_redownload: skip")

      nil
    end
  end
end
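
The re-download decision is delegated to the parser: it is instantiated with the cached data and asked redownload?. A minimal sketch of such a parser, assuming the cached data hash carries the "downloaded_timestamp" key seen elsewhere on this page; any other methods Crawline::BaseParser may require are omitted:

class MyItemParser < Crawline::BaseParser
  def initialize(url, data)
    @url  = url
    @data = data
  end

  # Ask the engine to re-download when the cached copy is older than a day.
  def redownload?
    Time.now.utc - @data["downloaded_timestamp"] > 24 * 60 * 60
  end
end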

#find_parser(url) ⇒ Object



# File 'lib/crawline.rb', line 292

def find_parser(url)
  @logger.debug("Engine#find_parser: start: url=#{url}")

  parser = @parsers.find do |url_pattern, clazz|
    url_pattern.match(url)
  end
  @logger.debug("Engine#find_parser: parser=#{parser}")

  if parser.nil?
    @logger.debug("Engine#find_parser: parser not found")
    raise ParserNotFoundError.new(url)
  end

  parser[1]
end
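
A lookup sketch using the parsers hash from the constructor example; an unmatched URL raises ParserNotFoundError:

engine.find_parser("https://example.com/item/42")   # => MyItemParser
engine.find_parser("https://other.example.org/")    # raises ParserNotFoundError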

#get_latest_data_from_storage(url) ⇒ Object



# File 'lib/crawline.rb', line 325

def get_latest_data_from_storage(url)
  @logger.debug("Engine#get_latest_data_from_storage: start: url=#{url}")

  s3_path = convert_url_to_s3_path(url)
  data = @repo.get_s3_object(s3_path + ".json")

  if not data.nil?
    json_to_data(data)
  else
    nil
  end
end
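
A retrieval sketch; nil means nothing has been cached yet for the URL:

data = engine.get_latest_data_from_storage("https://example.com/page/1")

unless data.nil?
  data["response_body"]          # Base64-decoded body bytes
  data["downloaded_timestamp"]   # Time in UTC, whole-second precision
end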

#json_to_data(json_data) ⇒ Object



# File 'lib/crawline.rb', line 316

def json_to_data(json_data)
  data = JSON.parse(json_data)
  data["response_body"] = Base64.urlsafe_decode64(data["response_body"])
  data["response_body"].force_encoding("US-ASCII")
  data["downloaded_timestamp"] = Time.at(data["downloaded_timestamp"], 0).getutc

  data
end
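
Deserialization is the inverse of data_to_json, with two caveats visible in the source: the body is re-tagged as US-ASCII and the timestamp is truncated to whole seconds. Continuing the data_to_json sketch above:

restored = engine.json_to_data(json)

restored["response_body"]          # original bytes, forced to US-ASCII encoding
restored["downloaded_timestamp"]   # Time.at(<unix seconds>).getutc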

#parse(url) ⇒ Object



# File 'lib/crawline.rb', line 259

def parse(url)
  @logger.debug("Engine#parse: start: url=#{url}")

  url_list = [url]
  result = { "success_url_list" => [], "fail_url_list" => [], "context" => {} }

  until url_list.empty? do
    target_url = url_list.shift
    @logger.debug("Engine#parse: target_url=#{target_url}")

    begin
      next_links = parse_impl(target_url, result["context"])

      if not next_links.nil?
        next_links.each do |next_link|
          url_list << next_link if (not url_list.include?(next_link)) && (not result["success_url_list"].include?(next_link)) && (not result["fail_url_list"].include?(next_link))
        end
      end

      result["success_url_list"].push(target_url)
    rescue => err
      @logger.warn("Engine#parse: parse error")
      @logger.warn(err)

      result["fail_url_list"].push(target_url)
    end

    @logger.info("Engine#parse: progress: total=#{url_list.size + result["success_url_list"].size + result["fail_url_list"].size}, success=#{result["success_url_list"].size}, fail=#{result["fail_url_list"].size}, remaining=#{url_list.size}")
  end

  result["context"]
end
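
A usage sketch; parse walks the link graph the same way crawl does but returns only the accumulated context hash, whose contents are defined by the application's parser classes:

context = engine.parse("https://example.com/list/1")
# context is result["context"] after every reachable URL has been parsed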

#put_data_to_storage(url, data, related_links) ⇒ Object



# File 'lib/crawline.rb', line 362

def put_data_to_storage(url, data, related_links)
  @logger.debug("Engine#put_data_to_storage: start: url=#{url}, data=#{data.size if not data.nil?}")

  s3_path = convert_url_to_s3_path(url)
  @repo.put_s3_object(s3_path + ".json", data_to_json(data))

  # save database
  cache_data = Model::CrawlineCache.new(url: data["url"], request_method: data["request_method"], downloaded_timestamp: data["downloaded_timestamp"], storage_path: s3_path)

  headers_data = data["request_headers"].map do |k, v|
    Model::CrawlineHeader.new(crawline_cache: cache_data, message_type: "request", header_name: k, header_value: v)
  end

  headers_data += data["response_headers"].map do |k, v|
    Model::CrawlineHeader.new(crawline_cache: cache_data, message_type: "response", header_name: k, header_value: v)
  end

  if not related_links.nil?
    urls = related_links
  else
    urls = []
  end

  related_links_data = urls.map do |url|
    Model::CrawlineRelatedLink.new(crawline_cache: cache_data, url: url)
  end

  ActiveRecord::Base.transaction do
    cache_data.save!

    Model::CrawlineHeader.import(headers_data)

    Model::CrawlineRelatedLink.import(related_links_data)
  end
end
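
A storage sketch, assuming a data hash shaped as in the data_to_json example and that the S3 bucket and ActiveRecord models are configured:

engine.put_data_to_storage(url, data, ["https://example.com/item/2"])
# writes "<2-char prefix>/<sha256>.json" to S3, then saves the cache row,
# request/response headers and related links to the database in one transaction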