Add files via upload

parth818 · web-flow · commit d50784bd17ec · 2019-07-27T14:53:45.000+05:30
diff --git a/insta_image_saving/instagram_image_scrapping.ipynb b/insta_image_saving/instagram_image_scrapping.ipynb
@@ -0,0 +1,174 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from time import sleep\n",
+    "import scrapy\n",
+    "import pandas as pd\n",
+    "from scrapy import Spider\n",
+    "from selenium import webdriver\n",
+    "from scrapy.selector import Selector\n",
+    "from io import BytesIO\n",
+    "from PIL import Image\n",
+    "import os\n",
+    "import requests"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1951\n"
+     ]
+    }
+   ],
+   "source": [
+    "imageID = []\n",
+    "sl_no = []\n",
+    "imageLikes = []\n",
+    "i = 0\n",
+    "instaccountlink = \"https://instagram.com/audi\"\n",
+    "instaaccountname = \"Audi\"\n",
+    "driver = webdriver.Chrome(\"driver/driver\")\n",
+    "driver.get(instaccountlink)\n",
+    "unique_urls = []\n",
+    "while i<300:\n",
+    "    i = i +1\n",
+    "    sel = Selector(text=driver.page_source)\n",
+    "    \n",
+    "    url = sel.xpath('//div[@class=\"v1Nh3 kIKUG  _bz0w\"]/a/@href').extract()\n",
+    "    for u in url:\n",
+    "        if u not in unique_urls:\n",
+    "            unique_urls.append(u)\n",
+    "            \n",
+    "    driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n",
+    "    sel = Selector(text=driver.page_source)\n",
+    "    url = sel.xpath('//div[@class=\"v1Nh3 kIKUG  _bz0w\"]/a/@href').extract()\n",
+    "    sleep(1)\n",
+    "    for u in url:\n",
+    "        if u not in unique_urls:\n",
+    "            unique_urls.append(u)\n",
+    "            \n",
+    "driver.quit()\n",
+    "print(len(unique_urls))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "file saved successfully\n"
+     ]
+    }
+   ],
+   "source": [
+    "file = open(\"output/audi_instagram_11_07_2019.csv\",\"a\")\n",
+    "for u in unique_urls :\n",
+    "    file.write(u)\n",
+    "    file.write(\"\\n\")\n",
+    "file.close()\n",
+    "print(\"file saved successfully\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# saving the images to specified directory\n",
+    "driver = webdriver.Chrome('driver/driver')\n",
+    "\n",
+    "image_urls = []\n",
+    "count = 0\n",
+    "max_no_of_iteration=250\n",
+    "for u in unique_urls:\n",
+    "    try:\n",
+    "        driver.get('http://instagram.com'+u)\n",
+    "        sel = Selector(text=driver.page_source)\n",
+    "\n",
+    "        src= sel.xpath('//div/img/@src').extract()[0]\n",
+    "#             print(src)\n",
+    "        r = requests.get(src)\n",
+    "        \n",
+    "        image = Image.open(BytesIO(r.content))\n",
+    "#         path = \"C:/Users/carbon/Desktop/output/\"+instaAccountName+str(count)+\".\" + image.format    \n",
+    "        path = \"output/\"+instaaccountname+str(count)+\".\" + image.format\n",
+    "#             print(image.size, image.format, image.mode)\n",
+    "        q1=''\n",
+    "        q2=''\n",
+    "        try:\n",
+    "            image.save(path, image.format)\n",
+    "            q1 = instaaccountname+str(count)\n",
+    "            q2 = sel.xpath('//span/span/text()').extract_first()\n",
+    "#             print(q1)\n",
+    "#             print(q2)\n",
+    "\n",
+    "        except IOError:\n",
+    "            q1=''\n",
+    "            q2=''\n",
+    "        imageID.insert(len(imageID),q1)\n",
+    "        imageLikes.insert(len(imageLikes),q2)\n",
+    "        sl_no.insert(len(sl_no),str(count))\n",
+    "        count = count + 1\n",
+    "        if count > max_no_of_iteration:\n",
+    "            driver.quit()\n",
+    "            df = pd.DataFrame({'ImageID':imageID,'Sl_no':sl_no, 'ImageLikes':imageLikes})\n",
+    "            fileName = instaaccountname+str('.csv')\n",
+    "            df.to_csv(fileName, index=False)\n",
+    "            break\n",
+    "\n",
+    "\n",
+    "    except:\n",
+    "        pass\n",
+    "\n",
+    "try:\n",
+    "    driver.quit()\n",
+    "except:\n",
+    "    pass"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/insta_image_saving/instructions.txt b/insta_image_saving/instructions.txt
@@ -0,0 +1,7 @@
+INSTRUCTIONS FOR SUCCESSFUL OPERATION :
+
+--> hardcode variable "instaccountlink" with desired instagram URL
+--> hardcode variable "instaaccountname" with desired name
+--> hardcode variable "file" with desired output name
+--> set condition for scrolling, i.e. variable "i" which is by default set to 300
+--> you may need to redefine HTML tags used, as these are regularly changed as per INSTAGRAM security concerns, the HTML tags used here is latest as per best of my knowledge.
diff --git a/insta_image_saving/readme.txt b/insta_image_saving/readme.txt
@@ -0,0 +1 @@
+THIS A JUPYTER NOTEBOOK THAT CAN SAVE ALL OR SOME IMAGES FROM A PUBLIC INSTAGRAM URL AS PER USER'S DIRECTIVES.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+THIS A JUPYTER NOTEBOOK THAT CAN SAVE ALL OR SOME IMAGES FROM A PUBLIC INSTAGRAM URL AS PER USER'S DIRECTIVES.`