Merge pull request #15 from hahwul/dev
Release v1.3.5
hahwul authored Sep 25, 2024
2 parents 6701abc + 0690ef1 commit b779789
Showing 10 changed files with 195 additions and 70 deletions.
22 changes: 22 additions & 0 deletions .github/labeler.yml
@@ -0,0 +1,22 @@
---
config:
- changed-files:
  - any-glob-to-any-file: [deadfinder.gemspec, ruby-version, .rubocop.yml]
dependencies:
- changed-files:
  - any-glob-to-any-file: [Gemfile, Gemfile.lock, deadfinder.gemspec]
workflow:
- changed-files:
  - any-glob-to-any-file: [.github/workflows/**, .github/labeler.yml]
github-action:
- changed-files:
  - any-glob-to-any-file: [github-action/**, action.yml]
docker:
- changed-files:
  - any-glob-to-any-file:
    - Dockerfile
    - .github/workflows/docker-ghcr.yml
    - github-action/Dockerfile
code:
- changed-files:
  - any-glob-to-any-file: [lib/**, bin/**, spec/**]
19 changes: 19 additions & 0 deletions .github/workflows/contributors.yml
@@ -0,0 +1,19 @@
---
name: Contributors
on:
  push:
    branches: [main]
  workflow_dispatch:
    inputs:
      logLevel:
        description: manual run
        required: false
        default: ''
jobs:
  contributors:
    runs-on: ubuntu-latest
    steps:
      - uses: wow-actions/contributors-list@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          round: true
11 changes: 11 additions & 0 deletions .github/workflows/labeler.yml
@@ -0,0 +1,11 @@
---
name: Pull Request Labeler
on: [pull_request_target]
jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/labeler@v5
35 changes: 18 additions & 17 deletions Gemfile.lock
@@ -1,35 +1,36 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    colorize (0.8.1)
-    concurrent-ruby (1.1.10)
-    concurrent-ruby-edge (0.6.0)
-      concurrent-ruby (~> 1.1.6)
+    colorize (1.1.0)
+    concurrent-ruby (1.3.4)
+    concurrent-ruby-edge (0.7.1)
+      concurrent-ruby (~> 1.3)
     date (3.3.4)
-    ethon (0.15.0)
+    ethon (0.16.0)
       ffi (>= 1.15.0)
-    ffi (1.15.5)
-    json (2.6.3)
-    mini_portile2 (2.8.2)
-    nokogiri (1.13.8)
-      mini_portile2 (~> 2.8.0)
+    ffi (1.17.0-arm64-darwin)
+    ffi (1.17.0-x86_64-darwin)
+    json (2.7.2)
+    nokogiri (1.16.7-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.16.7-x86_64-darwin)
+      racc (~> 1.4)
     open-uri (0.4.1)
       stringio
       time
       uri
-    racc (1.6.0)
-    set (1.0.3)
+    racc (1.8.1)
+    set (1.1.0)
     sitemap-parser (0.5.6)
       nokogiri (~> 1.6)
       typhoeus (>= 0.6, < 2.0)
-    stringio (3.1.0)
-    thor (1.2.1)
-    time (0.3.0)
+    stringio (3.1.1)
+    thor (1.3.2)
+    time (0.4.0)
       date
-    typhoeus (1.4.0)
+    typhoeus (1.4.1)
       ethon (>= 0.9.0)
-    uri (0.13.0)
+    uri (0.13.1)
 
 PLATFORMS
   arm64-darwin-22
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ deadfinder sitemap https://www.hahwul.com/sitemap.xml
 ```yml
 steps:
 - name: Run DeadFinder
-  uses: hahwul/deadfinder@1.3.4
+  uses: hahwul/deadfinder@1.3.5
   id: broken-link
   with:
     command: sitemap
2 changes: 1 addition & 1 deletion github-action/Dockerfile
@@ -1,4 +1,4 @@
-FROM ghcr.io/hahwul/deadfinder:1.3.4
+FROM ghcr.io/hahwul/deadfinder:1.3.5
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod 755 /entrypoint.sh
 ENTRYPOINT ["/entrypoint.sh"]
94 changes: 48 additions & 46 deletions lib/deadfinder.rb
@@ -13,49 +13,37 @@
 require 'json'
 
 Channel = Concurrent::Channel
-CacheSet = Set.new
-CacheQue = {}
-Output = {}
+CacheSet = Concurrent::Map.new
+CacheQue = Concurrent::Map.new
+Output = Concurrent::Map.new
 
 class DeadFinderRunner
   def run(target, options)
-    page = nil
-
-    if options['headers'].length.positive?
-      headers = {}
-      options['headers'].each do |header|
-        kv = header.split ': '
-        headers[kv[0]] = kv[1]
-      rescue StandardError
-      end
-
-      page = Nokogiri::HTML(URI.open(target, headers))
-    else
-      page = Nokogiri::HTML(URI.open(target))
+    headers = options['headers'].each_with_object({}) do |header, hash|
+      kv = header.split(': ')
+      hash[kv[0]] = kv[1]
+    rescue StandardError
     end
+    page = Nokogiri::HTML(URI.open(target, headers))
+    links = extract_links(page)
 
-    nodeset_a = page.css('a')
-    link_a = nodeset_a.map { |element| element['href'] }.compact
-    nodeset_script = page.css('script')
-    link_script = nodeset_script.map { |element| element['src'] }.compact
-    nodeset_link = page.css('link')
-    link_link = nodeset_link.map { |element| element['href'] }.compact
-
-    link_merged = []
-    link_merged.concat link_a, link_script, link_link
+    total_links_count = links.values.flatten.length
+    # Generate link info string for non-empty link types
+    link_info = links.map { |type, urls| "#{type}:#{urls.length}" if urls.length.positive? }.compact.join(' / ')
 
     Logger.target target
-    Logger.sub_info "Found #{link_merged.length} point. [a:#{link_a.length}/s:#{link_script.length}/l:#{link_link.length}]"
+    # Log the information if there are any links
+    Logger.sub_info "Found #{total_links_count} URLs. [#{link_info}]" unless link_info.empty?
     Logger.sub_info 'Checking'
-    jobs = Channel.new(buffer: :buffered, capacity: 1000)
-
+    jobs = Channel.new(buffer: :buffered, capacity: 1000)
     results = Channel.new(buffer: :buffered, capacity: 1000)
 
     (1..options['concurrency']).each do |w|
       Channel.go { worker(w, jobs, results, target, options) }
     end
 
-    link_merged.uniq.each do |node|
-      result = generate_url node, target
+    links.values.flatten.uniq.each do |node|
+      result = generate_url(node, target)
       jobs << result unless result.nil?
     end

@@ -72,49 +60,63 @@ def run(target, options)
 
   def worker(_id, jobs, results, target, options)
     jobs.each do |j|
-      if !CacheSet.include? j
-        CacheSet.add j
+      if CacheSet[j]
+        Logger.found "[404 Not Found] #{j}" unless CacheQue[j]
+      else
+        CacheSet[j] = true
         begin
           CacheQue[j] = true
           URI.open(j, read_timeout: options['timeout'])
         rescue StandardError => e
           if e.to_s.include? '404 Not Found'
             Logger.found "[#{e}] #{j}"
             CacheQue[j] = false
-            Output[target] = [] if Output[target].nil?
-            Output[target].push j
+            Output[target] ||= []
+            Output[target] << j
           end
         end
-      elsif !CacheQue[j]
-        Logger.found "[404 Not Found] #{j}"
       end
       results << j
     end
   end
+
+  private
+
+  def extract_links(page)
+    {
+      anchor: page.css('a').map { |element| element['href'] }.compact,
+      script: page.css('script').map { |element| element['src'] }.compact,
+      link: page.css('link').map { |element| element['href'] }.compact,
+      iframe: page.css('iframe').map { |element| element['src'] }.compact,
+      form: page.css('form').map { |element| element['action'] }.compact,
+      object: page.css('object').map { |element| element['data'] }.compact,
+      embed: page.css('embed').map { |element| element['src'] }.compact
+    }
+  end
 end
 
 def run_pipe(options)
   app = DeadFinderRunner.new
   while $stdin.gets
-    target = $LAST_READ_LINE.gsub("\n", '')
+    target = $LAST_READ_LINE.chomp
     app.run target, options
   end
-  gen_output
+  gen_output(options)
 end
 
 def run_file(filename, options)
   app = DeadFinderRunner.new
-  File.open(filename).each do |line|
-    target = line.gsub("\n", '')
+  File.foreach(filename) do |line|
+    target = line.chomp
     app.run target, options
   end
-  gen_output
+  gen_output(options)
 end
 
 def run_url(url, options)
   app = DeadFinderRunner.new
   app.run url, options
-  gen_output
+  gen_output(options)
 end
 
 def run_sitemap(sitemap_url, options)
@@ -125,15 +127,15 @@ def run_sitemap(sitemap_url, options)
     turl = generate_url url, base_uri
     app.run turl, options
   end
-  gen_output
+  gen_output(options)
 end
 
-def gen_output
-  File.write options['output'], Output.to_json if options['output'] != ''
+def gen_output(options)
+  File.write(options['output'], Output.to_json) unless options['output'].empty?
 end
 
 class DeadFinder < Thor
-  class_option :concurrency, aliases: :c, default: 20, type: :numeric, desc: 'Number of concurrncy'
+  class_option :concurrency, aliases: :c, default: 50, type: :numeric, desc: 'Number of concurrency'
   class_option :timeout, aliases: :t, default: 10, type: :numeric, desc: 'Timeout in seconds'
   class_option :output, aliases: :o, default: '', type: :string, desc: 'File to write JSON result'
   class_option :headers, aliases: :H, default: [], type: :array, desc: 'Custom HTTP headers to send with request'
6 changes: 2 additions & 4 deletions lib/deadfinder/utils.rb
@@ -28,12 +28,10 @@ def ignore_scheme?(url)
 end
 
 def extract_directory(uri)
-  if uri.path.end_with?('/')
-    return "#{uri.scheme}://#{uri.host}#{uri.path}"
-  end
+  return "#{uri.scheme}://#{uri.host}#{uri.path}" if uri.path.end_with?('/')
 
   path_components = uri.path.split('/')
-  last_component = path_components.last
+  path_components.last
   path_components.pop
 
   directory_path = path_components.join('/')
2 changes: 1 addition & 1 deletion lib/deadfinder/version.rb
@@ -1,3 +1,3 @@
 # frozen_string_literal: true
 
-VERSION = '1.3.4'
+VERSION = '1.3.5'
72 changes: 72 additions & 0 deletions spec/utils_spec.rb
@@ -0,0 +1,72 @@
# frozen_string_literal: true

require 'uri'
require_relative '../lib/deadfinder/utils'

RSpec.describe 'Utils' do
  describe '#generate_url' do
    let(:base_url) { 'http://example.com/base/' }

    it 'returns the original URL if it starts with http://' do
      expect(generate_url('http://example.com', base_url)).to eq('http://example.com')
    end

    it 'returns the original URL if it starts with https://' do
      expect(generate_url('https://example.com', base_url)).to eq('https://example.com')
    end

    it 'prepends the scheme if the URL starts with //' do
      expect(generate_url('//example.com', base_url)).to eq('http://example.com')
    end

    it 'prepends the scheme and host if the URL starts with /' do
      expect(generate_url('/path', base_url)).to eq('http://example.com/path')
    end

    it 'returns nil if the URL should ignore the scheme' do
      expect(generate_url('mailto:[email protected]', base_url)).to be_nil
    end

    it 'prepends the base directory if the URL is relative' do
      expect(generate_url('relative/path', base_url)).to eq('http://example.com/base/relative/path')
    end
  end

  describe '#ignore_scheme?' do
    it 'returns true for mailto: URLs' do
      expect(ignore_scheme?('mailto:[email protected]')).to be true
    end

    it 'returns true for tel: URLs' do
      expect(ignore_scheme?('tel:1234567890')).to be true
    end

    it 'returns true for sms: URLs' do
      expect(ignore_scheme?('sms:1234567890')).to be true
    end

    it 'returns true for data: URLs' do
      expect(ignore_scheme?('data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==')).to be true
    end

    it 'returns true for file: URLs' do
      expect(ignore_scheme?('file:///path/to/file')).to be true
    end

    it 'returns false for other URLs' do
      expect(ignore_scheme?('http://example.com')).to be false
    end
  end

  describe '#extract_directory' do
    it 'returns the base URL if the path ends with /' do
      uri = URI('http://example.com/base/')
      expect(extract_directory(uri)).to eq('http://example.com/base/')
    end

    it 'returns the directory path if the path does not end with /' do
      uri = URI('http://example.com/base/file')
      expect(extract_directory(uri)).to eq('http://example.com/base/')
    end
  end
end
