In this article, we will learn how to download the Instagram posts of a profile using Python's Selenium module.
Requirements:
- Google Chrome or Firefox
- Chrome driver(For Google Chrome) or Gecko driver(For Mozilla Firefox)
- Selenium package: It is a powerful tool for controlling a web browser through a program. It works with all major browsers and operating systems, and its scripts can be written in various languages, e.g., Python, Java, C#, etc. It can be installed using the below command:
pip install selenium
- Beautiful Soup package: It is a Python library for pulling data out of HTML and XML files. It works with your favorite parser to provide idiomatic ways of navigating, searching, and modifying the parse tree. It can be installed using the below command:
pip install bs4
- Requests package: The Requests library is one of the most widely used Python packages for making HTTP requests to a specified URL. It can be installed using the below command:
pip install requests
Step-by-step Approach:
Step 1: Importing modules and entering the login information along with the URL of the page.
Python3
# import required modules
import os
import time
from getpass import getpass

import requests
import selenium.common.exceptions
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# get instagram account credentials
username = input('Enter Your User Name ')
# getpass() reads the password without echoing it to the terminal
password = getpass('Enter Your Password ')

# assign URL: the profile page of the user whose posts are downloaded.
# The later steps open `url` in the browser and take its last path
# segment as the folder name, so it must end with the bare user name.
url = 'https://www.instagram.com/' + input(
    'Enter User Name Of User For Downloading Posts ').strip().strip('/')
Step 2: Function to start a new browser session. Depending on your installation, you might need to pass the path of the web driver to the webdriver.Chrome() function.
Python3
# start the browser session
def path():
    """Launch a new Chrome session and store it in the global ``chrome``.

    Every other step drives the browser through this module-level name.
    If the driver executable is not discoverable on this machine, supply
    its location to ``webdriver.Chrome`` here.
    """
    global chrome
    chrome = webdriver.Chrome()
Step 3: Function to enter the URL of the page.
Python3
# open the requested page
def url_name(url):
    """Load ``url`` in the browser and pause while the page renders.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)
    # Unconditional 4-second pause (not a driver-level wait): it gives
    # the page time to load so that the element lookups done by later
    # steps do not fail with NoSuchElementException.
    page_load_pause = 4
    time.sleep(page_load_pause)
Step 4: Function to enter your login information.
Python3
# login to access post
def login(username, your_password):
    """Log in through the login form of the page that is currently open.

    Clicks the log-in button, types the credentials, submits them with
    the RETURN key and then dismisses the "Not Now" prompt if one is
    shown.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If the log-in
            button or one of the credential fields cannot be found.
    """
    # Local import keeps this step self-contained. find_element(By...)
    # replaces the find_element_by_* helpers removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # open the login form
    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # finds the username box and sends the entered username
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # finds the password box and sends the entered password
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)

    # sends the enter key to submit the form
    passw.send_keys(Keys.RETURN)
    time.sleep(5.5)

    # Find Not Now Button. The prompt is not guaranteed to appear, so a
    # missing button must not abort the whole run.
    try:
        notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    except selenium.common.exceptions.NoSuchElementException:
        return
    notn.click()
    time.sleep(3)
Step 5: Function to open the first post.
Python3
# function to get first post
def first_post():
    """Open the first post of the profile by clicking its thumbnail.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no element
            with the thumbnail class is present on the page.
    """
    # Local import keeps this step self-contained. find_element(By...)
    # replaces the find_element_by_* helpers removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # click() returns None, so there is nothing worth keeping in a variable
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    # give the post time to open
    time.sleep(2)
Step 6: Function to download all posts.
Python3
def download_allposts():
    """Walk through every post of the profile and save its media.

    Opens the first post, creates a folder named after the last path
    segment of the global ``url`` and then saves post after post,
    advancing with the "next post" arrow until it is no longer found.
    Files are named ``content<N>`` for single-media posts and
    ``content<N>.<k>`` for the k-th slide of a multi-media post.
    """
    # open First Post
    first_post()

    # rstrip('/') so a trailing slash in the URL cannot yield an empty name
    user_name = url.rstrip('/').split('/')[-1]

    # create the folder for this user unless it already exists
    os.makedirs(user_name, exist_ok=True)

    # the first post is handled outside the try below, so a lookup
    # failure here still propagates to the caller
    _save_current_post(user_name, 1)

    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            # no "next" arrow: the last post has been reached
            break
        next_el.click()
        time.sleep(1.3)
        try:
            _save_current_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return
        c += 1


# helper: save the media of the post that is currently open
def _save_current_post(folder, index):
    """Save the open post as ``content<index>`` (or ``.<k>`` per slide)."""
    # Local import keeps this step self-contained. find_element(By...)
    # replaces the find_element_by_* helpers removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    prefix = os.path.join(folder, 'content' + str(index))

    # Check if the post contains multiple images or videos
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', prefix)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        # save the slide currently shown, then advance to the next one
        save_multiple(prefix + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()
    # the final slide has no "next" chevron; save it with last_img_flag set
    save_multiple(prefix + '.' + str(count_img), elem_img,
                  last_img_flag=True)
Step 7: Function to click on next post.
Python3
# function to get next post
def next_post():
    """Return the "next post" arrow element, or 0 when there is none.

    Returns:
        The element with class ``coreSpriteRightPaginationArrow``, or
        the falsy sentinel ``0`` if it is not on the page. The sentinel
        stays ``0`` (not ``None``) because callers may compare the
        result against ``False``.
    """
    # Local import keeps this step self-contained. find_element(By...)
    # replaces the find_element_by_* helpers removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return 0
Step 8: Function to save normal Posts.
Python3
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the open post to ``img_name``.

    Looks up the element with ``class_name``, takes the ``src`` of the
    first ``<video>`` inside it (or, failing that, of the first
    ``<img>``) and writes the downloaded bytes to ``img_name``. When the
    element, the media tag or the download is unavailable, a message is
    printed and nothing is written.

    Args:
        class_name: CSS class of the element that wraps the media.
        img_name: Path of the file to write.
    """
    # Local import keeps this step self-contained. find_element(By...)
    # replaces the find_element_by_* helpers removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    time.sleep(0.5)
    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # a video takes precedence over an image
    media = soup.find('video')
    if media is None:
        media = soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        # previously this case crashed when indexing the missing tag
        print("No image or video source found for " + img_name)
        return

    # a timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not write an HTTP error page to disk as if it were media
        print("Download failed for " + img_name)
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)
    time.sleep(0.9)
Step 9: Function to save nested Posts.
Python3
# Function to save multiple posts
def save_multiple(img_name, elem, last_img_flag=False):
    """Download one slide of a multi-media post to ``img_name``.

    Parses the inner HTML of ``elem``, takes the ``<li>`` items of its
    first ``<ul>`` and picks the last item when ``last_img_flag`` is
    set, otherwise the middle one. The ``src`` of that item's
    ``<video>`` (or else ``<img>``) is downloaded. When the list, the
    media tag or the download is unavailable, a message is printed and
    nothing is written.

    Args:
        img_name: Path of the file to write.
        elem: Browser element that contains the slide list.
        last_img_flag: Truthy when the slide to save is the final one.
    """
    time.sleep(1)
    markup = elem.get_attribute('innerHTML')
    html = bs(markup, 'html.parser')

    # first <ul> in the markup holds the slides
    slide_list = html.find('ul')
    list_images = slide_list.find_all('li') if slide_list is not None else []
    if not list_images:
        # previously this case crashed with an IndexError
        print("No slides found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        # NOTE(review): presumably the page keeps the neighbouring slides
        # in the list, making the middle item the visible one - confirm.
        user_image = list_images[len(list_images) // 2]

    # a video takes precedence over an image
    media = user_image.find('video')
    if media is None:
        media = user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    # a timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not write an HTTP error page to disk as if it were media
        print("Download failed for " + img_name)
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)
Step 10: Function to check if post is nested or not.
Python3
# function to check if the post is nested
def nested_check():
    """Return the "next slide" chevron of the open post, or 0 if absent.

    Returns:
        The element with class ``coreSpriteRightChevron``, or the falsy
        sentinel ``0`` when the post has no such element.
    """
    # Local import keeps this step self-contained. find_element(By...)
    # replaces the find_element_by_* helpers removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # let the post settle before looking for the chevron
    time.sleep(1)
    try:
        # the class name is given without the stray trailing space: a
        # class-name locator must be a single class token
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0
Step 11: Calling the required functions in the driver code.
Python3
# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole browser session and runs even when a step
    # above raises, so no browser is left behind after a failure
    chrome.quit()
Below is the complete program based on the above approach:
Python3
# import required modules
import os
import time
from getpass import getpass

import requests
import selenium.common.exceptions
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# get instagram account credentials
username = input('Enter Your User Name ')
# getpass() reads the password without echoing it to the terminal
password = getpass('Enter Your Password ')

# assign URL: the profile page of the user whose posts are downloaded.
# It is opened in the browser and its last path segment becomes the
# folder name, so it must end with the bare user name.
url = 'https://www.instagram.com/' + input(
    'Enter User Name Of User For Downloading Posts ').strip().strip('/')


# Start the browser session
def path():
    """Launch a new Chrome session and store it in the global ``chrome``."""
    global chrome
    # add the driver path here if required
    chrome = webdriver.Chrome()


# Open the requested page
def url_name(url):
    """Load ``url`` in the browser and pause while the page renders."""
    chrome.get(url)
    # Unconditional 4-second pause so later element lookups do not fail
    # with NoSuchElementException on a half-loaded page.
    time.sleep(4)


# Login to access post
def login(username, your_password):
    """Log in through the login form of the page that is currently open.

    Clicks the log-in button, types the credentials, submits them with
    the RETURN key and dismisses the "Not Now" prompt if one is shown.
    """
    # find_element(By...) replaces the find_element_by_* helpers that
    # were removed in Selenium 4.3
    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # finds the username box and sends the entered username
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # finds the password box and sends the entered password
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)

    # sends the enter key to submit the form
    passw.send_keys(Keys.RETURN)
    time.sleep(5.5)

    # Find Not Now Button. The prompt is not guaranteed to appear, so a
    # missing button must not abort the whole run.
    try:
        notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    except selenium.common.exceptions.NoSuchElementException:
        return
    notn.click()
    time.sleep(3)


# Function to open the first post
def first_post():
    """Open the first post of the profile by clicking its thumbnail."""
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)


# Function to get next post
def next_post():
    """Return the "next post" arrow element, or 0 when there is none."""
    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        return 0


# Download content of all posts
def download_allposts():
    """Walk through every post of the profile and save its media.

    Files are named ``content<N>`` for single-media posts and
    ``content<N>.<k>`` for the k-th slide of a multi-media post, inside
    a folder named after the last path segment of ``url``.
    """
    # open First Post
    first_post()

    # rstrip('/') so a trailing slash in the URL cannot yield an empty name
    user_name = url.rstrip('/').split('/')[-1]

    # create the folder for this user unless it already exists
    os.makedirs(user_name, exist_ok=True)

    # the first post is handled outside the try below, so a lookup
    # failure here still propagates to the caller
    _save_current_post(user_name, 1)

    c = 2
    while True:
        next_el = next_post()
        if not next_el:
            # no "next" arrow: the last post has been reached
            break
        next_el.click()
        time.sleep(1.3)
        try:
            _save_current_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return
        c += 1


# Helper: save the media of the post that is currently open
def _save_current_post(folder, index):
    """Save the open post as ``content<index>`` (or ``.<k>`` per slide)."""
    prefix = os.path.join(folder, 'content' + str(index))

    # Check if the post contains multiple images or videos
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', prefix)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        # save the slide currently shown, then advance to the next one
        save_multiple(prefix + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()
    # the final slide has no "next" chevron; save it with last_img_flag set
    save_multiple(prefix + '.' + str(count_img), elem_img,
                  last_img_flag=True)


# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the video or image of the open post to ``img_name``.

    When the wrapping element, the media tag or the download is
    unavailable, a message is printed and nothing is written.
    """
    time.sleep(0.5)
    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # a video takes precedence over an image
    media = soup.find('video')
    if media is None:
        media = soup.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    # a timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not write an HTTP error page to disk as if it were media
        print("Download failed for " + img_name)
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)
    time.sleep(0.9)


# Function to save multiple posts
def save_multiple(img_name, elem, last_img_flag=False):
    """Download one slide of a multi-media post to ``img_name``.

    Takes the ``<li>`` items of the first ``<ul>`` inside ``elem`` and
    picks the last item when ``last_img_flag`` is set, otherwise the
    middle one; that item's ``<video>`` (or else ``<img>``) source is
    downloaded.
    """
    time.sleep(1)
    markup = elem.get_attribute('innerHTML')
    html = bs(markup, 'html.parser')

    # first <ul> in the markup holds the slides
    slide_list = html.find('ul')
    list_images = slide_list.find_all('li') if slide_list is not None else []
    if not list_images:
        print("No slides found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        # NOTE(review): presumably the page keeps the neighbouring slides
        # in the list, making the middle item the visible one - confirm.
        user_image = list_images[len(list_images) // 2]

    # a video takes precedence over an image
    media = user_image.find('video')
    if media is None:
        media = user_image.find('img')
    link = media.get('src') if media is not None else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    # a timeout keeps one stalled download from hanging the whole run
    response = requests.get(link, timeout=30)
    if not response.ok:
        # do not write an HTTP error page to disk as if it were media
        print("Download failed for " + img_name)
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)


# Function to check if the post is nested
def nested_check():
    """Return the "next slide" chevron of the open post, or 0 if absent."""
    # let the post settle before looking for the chevron
    time.sleep(1)
    try:
        # the class name is given without the stray trailing space: a
        # class-name locator must be a single class token
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0


# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() ends the whole browser session and runs even when a step
    # above raises, so no browser is left behind after a failure
    chrome.quit()
After running this complete script, a directory will be created which will contain all the posts.
Output:
Note: If you are a Windows user, the posts will be saved with a .file extension; open them with an application that can open both images and videos (Instagram posts contain only two types of media: images or videos).