
Commit 93db014

Merge pull request #140 from HolgerDoerner/feature/medium_article_scraper
Add medium article scraper
2 parents d9d262e + 99a7c10 commit 93db014

File tree

4 files changed (+122, -0 lines changed)


Python/medium_article_scraper/LICENCE

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Holger Dörner

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
# mediumScraper

A scraper for Medium.com posts.

This script searches `Medium.com` for a supplied `Topic` and returns the results as `JSON`.

## Requirements & Dependencies

Recommended Python version:
- `3.5+`

The only external dependency is:
- `BeautifulSoup4`

## Installation

Install the required external dependencies with

```shell
$ pip3 install -r requirements.txt
```

Also, a `chmod +x medium_scraper.py` *can* be needed to make the script executable; otherwise you have to call it with `python3` (see [Usage examples](#usage-examples)).

## Usage examples

Getting help:
```shell
$ (python3) medium_scraper.py -h
```

Get posts for `python`:
```shell
$ (python3) medium_scraper.py python
```

Get a maximum of `100` posts for `software development`:
```shell
$ (python3) medium_scraper.py "software development" -c 100
```

Pretty-print the `json` output:
```shell
$ (python3) medium_scraper.py python -b
```
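The URL the script builds for the search above can be sketched with only the standard library (the query format shown is the one the script assumes Medium's search endpoint accepts):

```python
from urllib import parse

# URL-encode the topic so spaces survive in the query string.
topic = parse.quote("software development")
url = 'https://medium.com/search/posts?q={topic}&count={count}'.format_map(
    {'topic': topic, 'count': 100})
print(url)  # https://medium.com/search/posts?q=software%20development&count=100
```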
Lines changed: 59 additions & 0 deletions
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
#!/usr/bin/env python3

import argparse
import json
import sys
from urllib import parse

import requests
from bs4 import BeautifulSoup


def parseArgs():
    parser = argparse.ArgumentParser(description='Gets posts from Medium.com for a specific topic.')
    parser.add_argument('topic', metavar='TOPIC', type=str,
                        help='the topic to search for')
    parser.add_argument('-c', '--count', dest='count', action='store', type=int, default=15,
                        help='maximum number of posts')
    parser.add_argument('-b', '--beautify', dest='beautify', action='store_true',
                        help='beautify json output')

    return parser.parse_args()


def run(args):
    def parsePost(tag):
        # Extract title, description and link from a single search result.
        title = tag.find('h3', class_='graf')
        desc = tag.find('p')
        url = tag.find_all('a')[3]
        return {
            'title': title.text if title else '',
            'desc': desc.text if desc else '',
            # Drop the tracking query parameters from the post URL.
            'url': url.get('href').split('?')[0] if url else '',
        }

    urlParams = {
        'topic': parse.quote(args.topic),
        'count': args.count,
    }

    url = 'https://medium.com/search/posts?q={topic}&count={count}'.format_map(urlParams)

    posts = []

    response = requests.get(url)

    soup = BeautifulSoup(response.text, 'html.parser')
    rawPosts = soup.find_all('div', class_='postArticle')

    if len(rawPosts) > 0:
        for post in rawPosts:
            posts.append(parsePost(post))
    else:
        print('No posts found for "%s"...' % args.topic)
        sys.exit(0)

    print(json.dumps(posts, indent=(4 if args.beautify else None)))


if __name__ == '__main__':
    run(parseArgs())
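The parsing step can be exercised in isolation on a synthetic snippet. The markup below is only an assumption about Medium's search-result HTML (a `div.postArticle` whose fourth `<a>` carries the post link); the live page may differ:

```python
from bs4 import BeautifulSoup

# Hypothetical markup mimicking one search result (assumption, not real Medium HTML).
html = '''
<div class="postArticle">
  <a></a><a></a><a></a>
  <a href="https://medium.com/@author/post?source=search"></a>
  <h3 class="graf">Sample title</h3>
  <p>Sample description</p>
</div>
'''

tag = BeautifulSoup(html, 'html.parser').find('div', class_='postArticle')
title = tag.find('h3', class_='graf')
desc = tag.find('p')
url = tag.find_all('a')[3]

post = {
    'title': title.text if title else '',
    'desc': desc.text if desc else '',
    'url': url.get('href').split('?')[0] if url else '',  # strip tracking params
}
print(post['url'])  # https://medium.com/@author/post
```

Guarding each field with `if … else ''` keeps the script from crashing when a result block is missing a title or description.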
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
BeautifulSoup4
