Merge pull request #15 from hahwul/dev
Release v1.3.5
hahwul authored Sep 25, 2024
2 parents 6701abc + 0690ef1 commit b779789
Showing 10 changed files with 195 additions and 70 deletions.
22 changes: 22 additions & 0 deletions .github/labeler.yml
@@ -0,0 +1,22 @@
---
config:
- changed-files:
  - any-glob-to-any-file: [deadfinder.gemspec, ruby-version, .rubocop.yml]
dependencies:
- changed-files:
  - any-glob-to-any-file: [Gemfile, Gemfile.lock, deadfinder.gemspec]
workflow:
- changed-files:
  - any-glob-to-any-file: [.github/workflows/**, .github/labeler.yml]
github-action:
- changed-files:
  - any-glob-to-any-file: [github-action/**, action.yml]
docker:
- changed-files:
  - any-glob-to-any-file:
    - Dockerfile
    - .github/workflows/docker-ghcr.yml
    - github-action/Dockerfile
code:
- changed-files:
  - any-glob-to-any-file: [lib/**, bin/**, spec/**]
19 changes: 19 additions & 0 deletions .github/workflows/contributors.yml
@@ -0,0 +1,19 @@
---
name: Contributors
on:
  push:
    branches: [main]
  workflow_dispatch:
    inputs:
      logLevel:
        description: manual run
        required: false
        default: ''
jobs:
  contributors:
    runs-on: ubuntu-latest
    steps:
      - uses: wow-actions/contributors-list@v1
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          round: true
11 changes: 11 additions & 0 deletions .github/workflows/labeler.yml
@@ -0,0 +1,11 @@
---
name: Pull Request Labeler
on: [pull_request_target]
jobs:
  labeler:
    permissions:
      contents: read
      pull-requests: write
    runs-on: ubuntu-latest
    steps:
      - uses: actions/labeler@v5
35 changes: 18 additions & 17 deletions Gemfile.lock
@@ -1,35 +1,36 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    colorize (0.8.1)
-    concurrent-ruby (1.1.10)
-    concurrent-ruby-edge (0.6.0)
-      concurrent-ruby (~> 1.1.6)
+    colorize (1.1.0)
+    concurrent-ruby (1.3.4)
+    concurrent-ruby-edge (0.7.1)
+      concurrent-ruby (~> 1.3)
     date (3.3.4)
-    ethon (0.15.0)
+    ethon (0.16.0)
       ffi (>= 1.15.0)
-    ffi (1.15.5)
-    json (2.6.3)
-    mini_portile2 (2.8.2)
-    nokogiri (1.13.8)
-      mini_portile2 (~> 2.8.0)
+    ffi (1.17.0-arm64-darwin)
+    ffi (1.17.0-x86_64-darwin)
+    json (2.7.2)
+    nokogiri (1.16.7-arm64-darwin)
+      racc (~> 1.4)
+    nokogiri (1.16.7-x86_64-darwin)
+      racc (~> 1.4)
     open-uri (0.4.1)
       stringio
       time
       uri
-    racc (1.6.0)
-    set (1.0.3)
+    racc (1.8.1)
+    set (1.1.0)
     sitemap-parser (0.5.6)
       nokogiri (~> 1.6)
       typhoeus (>= 0.6, < 2.0)
-    stringio (3.1.0)
-    thor (1.2.1)
-    time (0.3.0)
+    stringio (3.1.1)
+    thor (1.3.2)
+    time (0.4.0)
       date
-    typhoeus (1.4.0)
+    typhoeus (1.4.1)
       ethon (>= 0.9.0)
-    uri (0.13.0)
+    uri (0.13.1)
 
 PLATFORMS
   arm64-darwin-22
2 changes: 1 addition & 1 deletion README.md
@@ -34,7 +34,7 @@ deadfinder sitemap https://www.hahwul.com/sitemap.xml
 ```yml
 steps:
 - name: Run DeadFinder
-  uses: hahwul/deadfinder@1.3.4
+  uses: hahwul/deadfinder@1.3.5
   id: broken-link
   with:
     command: sitemap
2 changes: 1 addition & 1 deletion github-action/Dockerfile
@@ -1,4 +1,4 @@
-FROM ghcr.io/hahwul/deadfinder:1.3.4
+FROM ghcr.io/hahwul/deadfinder:1.3.5
 COPY entrypoint.sh /entrypoint.sh
 RUN chmod 755 /entrypoint.sh
 ENTRYPOINT ["/entrypoint.sh"]
94 changes: 48 additions & 46 deletions lib/deadfinder.rb
@@ -13,49 +13,37 @@
 require 'json'
 
 Channel = Concurrent::Channel
-CacheSet = Set.new
-CacheQue = {}
-Output = {}
+CacheSet = Concurrent::Map.new
+CacheQue = Concurrent::Map.new
+Output = Concurrent::Map.new
 
 class DeadFinderRunner
   def run(target, options)
-    page = nil
-
-    if options['headers'].length.positive?
-      headers = {}
-      options['headers'].each do |header|
-        kv = header.split ': '
-        headers[kv[0]] = kv[1]
-      rescue StandardError
-      end
-
-      page = Nokogiri::HTML(URI.open(target, headers))
-    else
-      page = Nokogiri::HTML(URI.open(target))
+    headers = options['headers'].each_with_object({}) do |header, hash|
+      kv = header.split(': ')
+      hash[kv[0]] = kv[1]
+    rescue StandardError
     end
+    page = Nokogiri::HTML(URI.open(target, headers))
+    links = extract_links(page)
 
-    nodeset_a = page.css('a')
-    link_a = nodeset_a.map { |element| element['href'] }.compact
-    nodeset_script = page.css('script')
-    link_script = nodeset_script.map { |element| element['src'] }.compact
-    nodeset_link = page.css('link')
-    link_link = nodeset_link.map { |element| element['href'] }.compact
-
-    link_merged = []
-    link_merged.concat link_a, link_script, link_link
+    total_links_count = links.values.flatten.length
+    # Generate link info string for non-empty link types
+    link_info = links.map { |type, urls| "#{type}:#{urls.length}" if urls.length.positive? }.compact.join(' / ')
 
     Logger.target target
-    Logger.sub_info "Found #{link_merged.length} point. [a:#{link_a.length}/s:#{link_script.length}/l:#{link_link.length}]"
+    # Log the information if there are any links
+    Logger.sub_info "Found #{total_links_count} URLs. [#{link_info}]" unless link_info.empty?
     Logger.sub_info 'Checking'
-    jobs = Channel.new(buffer: :buffered, capacity: 1000)
-
+    jobs = Channel.new(buffer: :buffered, capacity: 1000)
     results = Channel.new(buffer: :buffered, capacity: 1000)
 
     (1..options['concurrency']).each do |w|
       Channel.go { worker(w, jobs, results, target, options) }
     end
 
-    link_merged.uniq.each do |node|
-      result = generate_url node, target
+    links.values.flatten.uniq.each do |node|
+      result = generate_url(node, target)
       jobs << result unless result.nil?
     end

@@ -72,49 +60,63 @@ def run(target, options)
 
   def worker(_id, jobs, results, target, options)
     jobs.each do |j|
-      if !CacheSet.include? j
-        CacheSet.add j
+      if CacheSet[j]
+        Logger.found "[404 Not Found] #{j}" unless CacheQue[j]
+      else
+        CacheSet[j] = true
         begin
           CacheQue[j] = true
           URI.open(j, read_timeout: options['timeout'])
         rescue StandardError => e
           if e.to_s.include? '404 Not Found'
             Logger.found "[#{e}] #{j}"
             CacheQue[j] = false
-            Output[target] = [] if Output[target].nil?
-            Output[target].push j
+            Output[target] ||= []
+            Output[target] << j
           end
         end
-      elsif !CacheQue[j]
-        Logger.found "[404 Not Found] #{j}"
       end
       results << j
     end
   end
+
+  private
+
+  def extract_links(page)
+    {
+      anchor: page.css('a').map { |element| element['href'] }.compact,
+      script: page.css('script').map { |element| element['src'] }.compact,
+      link: page.css('link').map { |element| element['href'] }.compact,
+      iframe: page.css('iframe').map { |element| element['src'] }.compact,
+      form: page.css('form').map { |element| element['action'] }.compact,
+      object: page.css('object').map { |element| element['data'] }.compact,
+      embed: page.css('embed').map { |element| element['src'] }.compact
+    }
+  end
 end
 
 def run_pipe(options)
   app = DeadFinderRunner.new
   while $stdin.gets
-    target = $LAST_READ_LINE.gsub("\n", '')
+    target = $LAST_READ_LINE.chomp
     app.run target, options
   end
-  gen_output
+  gen_output(options)
 end
 
 def run_file(filename, options)
   app = DeadFinderRunner.new
-  File.open(filename).each do |line|
-    target = line.gsub("\n", '')
+  File.foreach(filename) do |line|
+    target = line.chomp
     app.run target, options
   end
-  gen_output
+  gen_output(options)
 end
 
 def run_url(url, options)
   app = DeadFinderRunner.new
   app.run url, options
-  gen_output
+  gen_output(options)
 end
 
 def run_sitemap(sitemap_url, options)
@@ -125,15 +127,15 @@ def run_sitemap(sitemap_url, options)
     turl = generate_url url, base_uri
     app.run turl, options
   end
-  gen_output
+  gen_output(options)
 end
 
-def gen_output
-  File.write options['output'], Output.to_json if options['output'] != ''
+def gen_output(options)
+  File.write(options['output'], Output.to_json) unless options['output'].empty?
 end
 
 class DeadFinder < Thor
-  class_option :concurrency, aliases: :c, default: 20, type: :numeric, desc: 'Number of concurrncy'
+  class_option :concurrency, aliases: :c, default: 50, type: :numeric, desc: 'Number of concurrency'
   class_option :timeout, aliases: :t, default: 10, type: :numeric, desc: 'Timeout in seconds'
   class_option :output, aliases: :o, default: '', type: :string, desc: 'File to write JSON result'
   class_option :headers, aliases: :H, default: [], type: :array, desc: 'Custom HTTP headers to send with request'
6 changes: 2 additions & 4 deletions lib/deadfinder/utils.rb
@@ -28,12 +28,10 @@ def ignore_scheme?(url)
 end
 
 def extract_directory(uri)
-  if uri.path.end_with?('/')
-    return "#{uri.scheme}://#{uri.host}#{uri.path}"
-  end
+  return "#{uri.scheme}://#{uri.host}#{uri.path}" if uri.path.end_with?('/')
 
   path_components = uri.path.split('/')
-  last_component = path_components.last
+  path_components.last
   path_components.pop
 
   directory_path = path_components.join('/')
2 changes: 1 addition & 1 deletion lib/deadfinder/version.rb
@@ -1,3 +1,3 @@
 # frozen_string_literal: true
 
-VERSION = '1.3.4'
+VERSION = '1.3.5'
72 changes: 72 additions & 0 deletions spec/utils_spec.rb
@@ -0,0 +1,72 @@
# frozen_string_literal: true

require 'uri'
require_relative '../lib/deadfinder/utils'

RSpec.describe 'Utils' do
  describe '#generate_url' do
    let(:base_url) { 'http://example.com/base/' }

    it 'returns the original URL if it starts with http://' do
      expect(generate_url('http://example.com', base_url)).to eq('http://example.com')
    end

    it 'returns the original URL if it starts with https://' do
      expect(generate_url('https://example.com', base_url)).to eq('https://example.com')
    end

    it 'prepends the scheme if the URL starts with //' do
      expect(generate_url('//example.com', base_url)).to eq('http://example.com')
    end

    it 'prepends the scheme and host if the URL starts with /' do
      expect(generate_url('/path', base_url)).to eq('http://example.com/path')
    end

    it 'returns nil if the URL should ignore the scheme' do
      expect(generate_url('mailto:[email protected]', base_url)).to be_nil
    end

    it 'prepends the base directory if the URL is relative' do
      expect(generate_url('relative/path', base_url)).to eq('http://example.com/base/relative/path')
    end
  end

  describe '#ignore_scheme?' do
    it 'returns true for mailto: URLs' do
      expect(ignore_scheme?('mailto:[email protected]')).to be true
    end

    it 'returns true for tel: URLs' do
      expect(ignore_scheme?('tel:1234567890')).to be true
    end

    it 'returns true for sms: URLs' do
      expect(ignore_scheme?('sms:1234567890')).to be true
    end

    it 'returns true for data: URLs' do
      expect(ignore_scheme?('data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==')).to be true
    end

    it 'returns true for file: URLs' do
      expect(ignore_scheme?('file:///path/to/file')).to be true
    end

    it 'returns false for other URLs' do
      expect(ignore_scheme?('http://example.com')).to be false
    end
  end

  describe '#extract_directory' do
    it 'returns the base URL if the path ends with /' do
      uri = URI('http://example.com/base/')
      expect(extract_directory(uri)).to eq('http://example.com/base/')
    end

    it 'returns the directory path if the path does not end with /' do
      uri = URI('http://example.com/base/file')
      expect(extract_directory(uri)).to eq('http://example.com/base/')
    end
  end
end
