The Node.js code below scrapes the most popular images from an Instagram page. The function 'ScrapeInstagramPage' accounts for post ageing: each post's like count is discounted by the age of the post before the posts are ranked.
var request = require('parse5');
var request = require('request');
var rp = require('request-promise');
var $ = require('cheerio'); // Basically jQuery for node.js
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
/**
 * Fetches an Instagram profile page, pages through the profile's posts and
 * prints the image URL of every post, ranked by an age-discounted like score
 * (highest score first).
 *
 * The score of a post is `likes / (log(ageing + days) / log(ageing))`, where
 * `days` is the whole number of days since the post was taken. A post from
 * today keeps its full like count; older posts are discounted logarithmically.
 *
 * Relies on the file-level `rp` (request-promise), `$` (cheerio) and `JSDOM`.
 *
 * @param {Object}  args
 * @param {string}  args.username  Profile name to scrape (required).
 * @param {boolean} [args.debug]   When truthy, intermediate data is logged.
 * @param {number}  [args.ageing]  Base of the ageing discount; must be a finite
 *                                 number > 1. Larger values discount old posts
 *                                 less. Defaults to 10.
 * @returns {Promise<Array<{url: string, likes_per_day: number,
 *           days_since_post: number, total_likes: number}>>}
 *          Resolves with the ranked posts collected so far. Request and parse
 *          failures are logged with an "ERROR: " prefix and never reject.
 * @throws {TypeError} If `args.username` is missing or not a non-empty string.
 */
function ScrapeInstagramPage (args) {
    if (!args || typeof args.username !== 'string' || args.username === '') {
        throw new TypeError('ScrapeInstagramPage: args.username must be a non-empty string');
    }

    const DEFAULT_AGEING = 10;
    const SECONDS_PER_DAY = 24 * 60 * 60;
    // log(ageing) is the divisor of the discount, so ageing must be > 1.
    const ageing = (typeof args.ageing === 'number' && Number.isFinite(args.ageing) && args.ageing > 1)
        ? args.ageing
        : DEFAULT_AGEING;

    const profile_url = 'https://www.instagram.com/' + encodeURIComponent(args.username) + '/';
    const cookieString = '';
    let pop_posts = [];

    dout("ScrapeInstagramPage for username -> " + args.username);

    /** Logs `msg` only when the caller asked for debug output. */
    function dout (msg) {
        if (args.debug) {
            console.log(msg);
        }
    }

    /**
     * Builds a fresh options object per request so that no request can observe
     * a URL left behind by another one.
     */
    function buildRequestOptions (url) {
        return {
            url: url,
            method: 'GET',
            headers: {
                'x-requested-with' : 'XMLHttpRequest',
                'accept-language' : 'en-US,en;q=0.8,pt;q=0.6,hi;q=0.4',
                'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
                // Referer follows the profile actually being scraped.
                'referer' : profile_url,
                'Cookie' : cookieString,
                'Accept' : '*/*',
                'Connection' : 'keep-alive',
                'authority' : 'www.instagram.com'
            },
            transform: autoParse
        };
    }

    /**
     * request-promise transform: parses the body according to the response's
     * media type. Only the part before the first ';' is compared, so headers
     * such as "text/html; charset=utf-8" are recognised.
     */
    function autoParse (body, response, resolveWithFullResponse) {
        const header = (response && response.headers && response.headers['content-type']) || '';
        const media_type = String(header).split(';')[0].trim().toLowerCase();
        if (media_type === 'application/json') {
            return JSON.parse(body);
        }
        if (media_type === 'text/html') {
            return $.load(body);
        }
        return body;
    }

    /** Converts one timeline edge into a scored post record. */
    function scorePost (edge) {
        const node = edge.node;
        const now_seconds = Date.now() / 1000;
        const seconds_since_post = now_seconds - node.taken_at_timestamp;
        // Clamp at 0: a timestamp slightly in the future (clock skew) must not
        // yield a negative age, which could push the logarithm to NaN.
        const days_since_post = Math.max(0, Math.floor(seconds_since_post / SECONDS_PER_DAY));
        const df = Math.log(ageing + days_since_post) / Math.log(ageing);
        const total_likes = node.edge_liked_by.count;
        return {
            url: node.display_url,
            likes_per_day: total_likes / df,
            days_since_post: days_since_post,
            total_likes: total_likes
        };
    }

    /** Orders posts by descending score. */
    function byScoreDescending (first, second) {
        if (second.likes_per_day < first.likes_per_day) {
            return -1;
        }
        if (second.likes_per_day > first.likes_per_day) {
            return 1;
        }
        return 0;
    }

    /**
     * Requests one page of posts and recurses while further pages exist,
     * appending every scored post to `pop_posts`.
     *
     * @param {string} user_id     Numeric profile id taken from the profile page.
     * @param {number} first       Page size requested from the endpoint.
     * @param {string} [end_cursor] Cursor of the previous page, if any.
     * @returns {Promise<undefined>} Settles once the last page was processed.
     */
    function GetPostsFromUser (user_id, first, end_cursor) {
        let url = 'https://www.instagram.com/graphql/query/?query_id=17880160963012870&id='
            + encodeURIComponent(user_id) + '&first=' + encodeURIComponent(first);
        if (end_cursor != undefined) {
            // Encode the cursor so characters such as '=' cannot corrupt the query string.
            url += '&after=' + encodeURIComponent(end_cursor);
        }
        return rp(buildRequestOptions(url))
            .then(function (autoParsedBody) {
                if (!autoParsedBody || autoParsedBody.status !== "ok") {
                    console.log( "ERROR: Posts AJAX call not returned good..." );
                    return undefined;
                }
                if (args.debug) console.log(autoParsedBody.data);
                const posts = autoParsedBody.data.user.edge_owner_to_timeline_media;
                if (posts.edges.length > 0) {
                    pop_posts = pop_posts.concat(posts.edges.map(scorePost));
                }
                // Require a cursor as well: without one the next request would
                // fetch the first page again and never terminate.
                if (posts.page_info.has_next_page && posts.page_info.end_cursor) {
                    return GetPostsFromUser(user_id, first, posts.page_info.end_cursor);
                }
                return undefined;
            });
    }

    return rp(buildRequestOptions(profile_url))
        .then(function (autoParsedBody) {
            if (!autoParsedBody || typeof autoParsedBody.html !== 'function') {
                throw new Error('profile page was not served as HTML');
            }
            if (args.debug) {
                console.log("Responce of 'Get first user page': ");
                console.log(autoParsedBody);
                console.log("Creating JSDOM from above Responce...");
            }
            // NOTE(security): runScripts "dangerously" executes the scripts of the
            // downloaded page inside this process. It is kept because the profile
            // data is only available after the page's inline script has populated
            // window._sharedData; do not point this at untrusted hosts.
            const dom = new JSDOM(autoParsedBody.html(), { runScripts: "dangerously" });
            const shared = dom.window._sharedData;
            if (args.debug) console.log(shared); // full data doc from instagram for a page
            const profile_pages = shared && shared.entry_data && shared.entry_data.ProfilePage;
            const user = profile_pages && profile_pages[0] && profile_pages[0].user;
            if (!user) {
                throw new Error('profile data not found in page');
            }
            if (args.debug) {
                console.log(user); // page user
                console.log(user.id); // user ID
                console.log(user.full_name); // user full_name
                console.log(user.username); // user username
                console.log(user.followed_by.count); // user followed_by
                console.log(user.profile_pic_url_hd); // user profile pic
                console.log(autoParsedBody.html());
            }
            if (user.is_private) {
                dout ("User account is PRIVATE");
                return undefined;
            }
            dout ("User account is public");
            return GetPostsFromUser(user.id, 5000, undefined);
        })
        .catch(function (err) {
            console.log( "ERROR: " + err );
        })
        .then(function () {
            // Rank and print once, after pagination has ended (or failed), so
            // that no URL is printed more than once.
            pop_posts.sort(byScoreDescending);
            pop_posts.forEach(function (post) {
                console.log(post.url);
            });
            return pop_posts;
        });
}
// Example run: scrape the "dress_blouse_designer" profile with debug output off.
ScrapeInstagramPage({
    username : "dress_blouse_designer",
    debug : false
});
Try it here
Example: given the URL 'https://www.instagram.com/dress_blouse_designer/', call the function as follows:
ScrapeInstagramPage ({username : "dress_blouse_designer", debug : false});