initial commit

This commit is contained in:
Sebastian Hugentobler 2015-03-12 15:56:56 +01:00
commit fc83ea25a9
6 changed files with 861 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
build/
node_modules/
.DS_Store

23
Gruntfile.js Normal file
View File

@ -0,0 +1,23 @@
module.exports = function(grunt) {
grunt.initConfig({
pkg: grunt.file.readJSON( 'package.json' ),
run: {
saver: {
cmd: 'casperjs',
args: [
'src/proboard_saver.coffee',
'--board-nr=YOUR-BOARD-NR',
'--board-name=YOUR-BOARD-NAME',
'--user=YOUR-USERNAME',
'--password=YOUR-PASSWORD'
]
}
}
});
grunt.loadNpmTasks('grunt-run');
grunt.registerTask('default', ['run:saver']);
grunt.registerTask('save', ['run:saver']);
};

13
README.md Normal file
View File

@ -0,0 +1,13 @@
This tool tries to scrape all the accessible data from a [Proboards Forum](https://proboards.com/).
I wrote this because I wanted to export at least the text data for an old
board hosted there which some friends and I were using years ago. As it turned
out you simply can't do that. So here we are.
The program probably does some horrible things and I can't say if it will work
for every theme. But hey, it only has to work one time to get at the data.
It is able to detect attachments and images and tries to download them too.
A working [casperjs](http://casperjs.org/) installation is needed for the stuff to work.

15
package.json Normal file
View File

@ -0,0 +1,15 @@
{
"name": "proboard_saver",
"version": "0.0.1",
"description": "",
"main": "src/proboard_saver.coffee",
"dependencies": {
"grunt": "~0.4.2",
"grunt-run": "~0.2.1"
},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "Sebastian Hugentobler",
"license": "CC0"
}

377
src/proboard_saver.coffee Normal file
View File

@ -0,0 +1,377 @@
# 2015 by Sebastian Hugentobler <shugentobler@vanwa.ch>
# To the extent possible under law, the author(s) have dedicated all copyright
# and related and neighboring rights to this software to the public domain
# worldwide. This software is distributed without any warranty.
# See http://creativecommons.org/publicdomain/zero/1.0/ for a description of CC0.
# Create the casper instance used by every step below. Web security is
# switched off in the page settings (presumably so cross-origin images and
# attachments can be fetched — confirm).
casper = require('casper').create(
verbose: false
logLevel: 'info'
pageSettings: {
webSecurityEnabled: false
}
)
utils = require('utils')
fs = require('fs')
# Echo script-side errors with the "ERROR" style.
casper.on 'error', (msg, trace) ->
@echo "Error: #{msg}", "ERROR"
# Echo errors raised inside the loaded page with the "ERROR" style.
casper.on 'page.error', (msg, trace) ->
@echo "Error: #{msg}", "ERROR"
# Forward console output from the page, except messages matching
# "Unsafe JavaScript attempt", which are dropped.
casper.on 'remote.message', (msg, trace) ->
if not /Unsafe JavaScript attempt/.test msg
@echo "remote log: #{msg}", "INFO"
# Downloads every image referenced as `[img]URL[/img]` in `searchString` to
# `data/images/<last path segment of URL>` and rewrites the tag to point at
# `{{baseurl}}/images/<name>`.
#
# The rewrite is done in a single pass with a replacer function instead of
# building a RegExp from the URL: URLs routinely contain regex
# metacharacters (`?`, `(`, `+`, ...) which previously made the replacement
# silently miss or throw. Returning the replacement from a function also
# keeps `$` sequences in file names from being interpreted.
#
# searchString - String containing BBCode text.
#
# Returns the text with all plain `[img]` tags rewritten.
loadImages = (searchString) ->
  searchString.replace /\[img\](.*?)\[\/img\]/g, (match, imageUrl) ->
    imageParts = imageUrl.split '/'
    imageName = imageParts[imageParts.length - 1]
    console.log "\t\tdownloading image '#{imageName}'..."
    casper.download imageUrl, "data/images/#{imageName}"
    "[img]{{baseurl}}/images/#{imageName}[/img]"
# Converts the HTML inside `element` to BBCode-like plain text.
#
# The element's innerHTML is rewritten in place, one tag family at a time;
# the final text is read from element.innerText and post-processed.
# Each `for` loop below replaces only the FIRST remaining match per
# iteration, so it relies on the collected list being in the same order as
# the tags appear in innerHTML.
#
# element - DOM node whose content is converted (mutated by this call).
#
# Returns the converted text as a String.
replaceHtml = (element) ->
# <img> -> [img=alt]src[/img] (the "=alt" part only when alt is non-empty)
images = Array::map.call element.querySelectorAll('img'), (img) ->
src: img.src, alt: if img.hasAttribute('alt') then img.alt else ''
for image in images
element.innerHTML = element.innerHTML.replace /<img[^>]*>/, "[img#{if image.alt then '=' + image.alt else ''}]#{image.src}[/img]"
# YouTube iframes -> [video]watch-url[/video]; the id is taken from the 5th
# '/'-separated piece of the src, with any query string cut off.
videos = Array::map.call element.querySelectorAll("iframe[title='YouTube video player']"), (video) -> video.src.split('/')[4].split('?')[0]
for video in videos
element.innerHTML = element.innerHTML.replace /<iframe title="YouTube video player"[^>]*>.*?<\/iframe>/, "[video]https://www.youtube.com/watch?v=#{video}[/video]"
# <i>, <b>, <u> -> [i], [b], [u]
cursiveElements = Array::map.call element.querySelectorAll('i'), (cursive) -> cursive.innerHTML
for cursive in cursiveElements
element.innerHTML = element.innerHTML.replace /<i>.*?<\/i>/, "[i]#{cursive}[/i]"
boldElements = Array::map.call element.querySelectorAll('b'), (bold) -> bold.innerHTML
for bold in boldElements
element.innerHTML = element.innerHTML.replace /<b>.*?<\/b>/, "[b]#{bold}[/b]"
underlinedElements = Array::map.call element.querySelectorAll('u'), (underlined) -> underlined.innerHTML
for underlined in underlinedElements
element.innerHTML = element.innerHTML.replace /<u>.*?<\/u>/, "[u]#{underlined}[/u]"
# <font color="x"> -> [colour=x]
# NOTE(review): `[^<\/font>]*` is a character class (excludes the characters
# < / f o n t >), not "anything up to </font>", and `".*"` is greedy —
# coloured text containing those letters will likely not match; confirm.
colourElements = Array::map.call element.querySelectorAll('font[color]'), (colour) -> name: colour.attributes['color'].value.toLowerCase(), innerHTML: colour.innerHTML
for colour in colourElements
element.innerHTML = element.innerHTML.replace /<font color=".*">[^<\/font>]*<\/font>/, "[colour=#{colour.name}]#{colour.innerHTML}[/colour]"
# Quotes: repeatedly take the first remaining div.quote_body, work out the
# quoted user, strip header/avatar/clear helpers, and replace the quote's
# parent node with a placeholder span holding "[quote=user]...[/quote]".
quote = element.querySelector 'div.quote_body'
while quote
quoteHeaderNode = quote.querySelector 'div.quote_header'
registeredUserNode = if quoteHeaderNode then quote.querySelector('div.quote_header').querySelector('span[itemprop="name"]') else null
user = null
if registeredUserNode
user = registeredUserNode.textContent
else if quote.parentNode.attributes['author']
# Fall back to the parent's "author" attribute, minus a leading '@'.
user = quote.parentNode.attributes['author'].value
if user.substr(0, 1) == '@'
user = user.substr 1
quoteHeader = quote.querySelector 'div.quote_header'
if quoteHeader then quoteHeader.parentNode.removeChild quoteHeader
quoteAvatar = quote.querySelector 'div.quote_avatar_container'
if quoteAvatar then quoteAvatar.parentNode.removeChild quoteAvatar
quoteClear = quote.querySelector 'div.quote_clear'
if quoteClear then quoteClear.parentNode.removeChild quoteClear
message = quote.innerHTML
dummySpan = document.createElement 'span'
dummySpan.setAttribute 'class', 'dummytag'
dummySpan.innerHTML = "[quote#{if user then '=' + user else ''}]#{message}[/quote]"
quote.parentNode.parentNode.replaceChild dummySpan, quote.parentNode
quote = element.querySelector 'div.quote_body'
# Unwrap the placeholder spans, leaving only their text.
# NOTE(review): `.*` here is greedy, so with several placeholders the first
# replacement may span more than one of them — confirm.
dummyElements = Array::map.call element.querySelectorAll('span.dummytag'), (dummy) -> dummy.innerHTML
for dummyContent in dummyElements
element.innerHTML = element.innerHTML.replace /<span class="dummytag">.*<\/span>/, dummyContent
# <a href> -> [url=href]text[/url]
linkElements = Array::map.call element.querySelectorAll('a[href]'), (link) -> target: link.attributes['href'].value, name: link.innerText
for link in linkElements
element.innerHTML = element.innerHTML.replace /<a[^>]*>.*?<\/a>/, "[url=#{link.target}]#{link.name}[/url]"
# Drop whatever <font> tags and quote_clear divs are left.
element.innerHTML = element.innerHTML.replace /<font [^>]*>/g, ''
element.innerHTML = element.innerHTML.replace /<\/font>/g, ''
element.innerHTML = element.innerHTML.replace /<div class="quote_clear"><\/div>/g, ''
finalText = element.innerText
finalText = finalText.replace /<br>/g, '\n'
# Smiley images hosted under the two proboards smiley paths are reduced to
# their alt text (the part after "[img=").
finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1'
finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1'
# Cut everything from the "Attachments:" heading onwards.
attachmentIndex = finalText.indexOf('\n\n[b]Attachments:[/b]\n\n')
if attachmentIndex > -1
finalText = finalText.substring 0, attachmentIndex
return finalText
# Lists the boards shown on the current page.
#
# Board anchors and description paragraphs are queried separately and paired
# by position; a board without a matching description gets `undefined`.
#
# Returns an Array of {title, description, link} objects.
findBoards = ->
  boardAnchors = document.querySelectorAll('tr.board.item td:nth-child(2) > span > a')
  descriptionNodes = document.querySelectorAll('tr.board.item td:nth-child(2) > p.description')
  Array::map.call boardAnchors, (anchor, index) ->
    return {
      title: anchor.textContent
      description: descriptionNodes[index]?.textContent
      link: anchor.href
    }
# Builds the list of page URLs for the paginated view currently loaded.
#
# The highest page number is read from the last pagination link
# (`...?page=N`) and URLs for pages 1..N are generated from its prefix.
# When the page has no pagination links, or the last link carries no usable
# page number, the current document URL is returned as the only page instead
# of throwing on a null match (which previously lost the whole board/thread).
#
# Returns an Array of URL Strings.
findPages = ->
  shownPages = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]')
  lastPage = shownPages[shownPages.length - 1]
  pageInfo = if lastPage then /(.*\?page=)(\d*)/.exec(lastPage.href) else null
  # Parse explicitly: the capture is a string and may be empty.
  maxPage = if pageInfo then parseInt(pageInfo[2], 10) else NaN
  return [document.location.href] if isNaN(maxPage) or maxPage < 1
  pageBase = pageInfo[1]
  ("#{pageBase}#{pageNr}" for pageNr in [1..maxPage])
# Lists the threads shown on the current board page.
#
# The thread id is the run of digits following "/thread/" in the link.
#
# Returns an Array of {id, title, link} objects.
findThreads = ->
  threadAnchors = document.querySelectorAll('tr.item.thread > td:nth-child(3) a.thread-link')
  Array::map.call threadAnchors, (anchor) ->
    return {
      id: /.*\/thread\/(\d*)\/.*/.exec(anchor.href)[1]
      title: anchor.textContent
      link: anchor.href
    }
# Extracts every post on the current thread page.
#
# replaceHtml - Function converting a message node to text; it is passed in
#               as an argument rather than referenced from the outer scope.
#
# Returns an Array of {id, message, attachments, timestamp, user} objects.
findPosts = (replaceHtml) ->
  # Attachment name: alt text of the first child element when the link
  # wraps one, otherwise the link text.
  describeAttachment = (anchor) ->
    return {
      name: if anchor.childElementCount > 0 then anchor.children[0].alt else anchor.text
      url: anchor.href
    }

  # Nodes with an href are user links (the last URL segment is kept as
  # `link`); otherwise the name is the node's first text child.
  describeAuthor = (userNode) ->
    if userNode.href
      segments = userNode.href.split '/'
      return { link: segments[segments.length - 1], name: userNode.textContent }
    return { link: '', name: userNode.firstChild.data.replace '\n\t', '' }

  Array::map.call document.querySelectorAll('tr.item.post'), (row) ->
    messageNode = row.querySelector('td.content div.message')
    # Collect the attachment links BEFORE replaceHtml rewrites the message.
    attachmentAnchors = messageNode.querySelectorAll('div.post_attachments blockquote a')
    dateNode = row.querySelector('td.content span.date > abbr.time')
    userNode = row.querySelector('td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile')
    postId = /post-(\d*)/.exec(row.id)[1]
    message = replaceHtml(messageNode)
    return {
      id: postId
      message: message
      attachments: Array::map.call(attachmentAnchors, describeAttachment)
      # data-timestamp is divided by 1000 (presumably milliseconds to seconds).
      timestamp: parseInt(dateNode.attributes['data-timestamp'].value, 10) / 1000
      user: describeAuthor(userNode)
    }
# Returns the href of every user link inside the members container of the
# current page, as an Array of Strings.
findUserLinks = ->
  memberAnchors = document.querySelectorAll('div.container.members a.user-link')
  (anchor.href for anchor in memberAnchors)
# Reads name, signature, status and registration time from the profile page
# currently loaded.
#
# replaceHtml - Function converting the signature node to text.
#
# Returns an object {name, signature, status, registered}.
getUser = (replaceHtml) ->
  displayName = document.querySelectorAll('span.big_username')[0].textContent

  # The last content box only counts as a signature when its converted text
  # contains the "Signature" heading line; that heading is then removed.
  signature = ''
  signatureBox = document.querySelector('td#center-column > div.content-box:last-child')
  if signatureBox
    converted = replaceHtml signatureBox
    if /Signature\n/.test converted
      signature = converted.replace 'Signature\n', ''

  statusNodes = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text')
  status = if statusNodes.length > 0 then statusNodes[0].textContent else ''

  registeredNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0]
  registered = parseInt(registeredNode.attributes['data-timestamp'].value, 10) / 1000

  return {
    name: displayName
    signature: signature
    status: status
    registered: registered
  }
# Reports a missing command line option and asks casper to exit.
#
# Exits with status 1 so callers such as the Grunt `run` task can tell a
# usage error apart from a successful run (previously the status was 0).
#
# argument - String name of the missing option.
missingArgumentError = (argument) ->
  console.log "missing the #{ argument } argument"
  casper.exit 1
# Read the four required command line options. Each check is a truthiness
# test, so an option that is absent (or given a falsy value) is reported
# through missingArgumentError.
if casper.cli.options['board-nr']
proboardNr = casper.cli.options['board-nr']
else
missingArgumentError 'board-nr'
if casper.cli.options['board-name']
proboardName = casper.cli.options['board-name']
else
missingArgumentError 'board-name'
if casper.cli.options['user']
user = casper.cli.options['user']
else
missingArgumentError 'user'
if casper.cli.options['password']
password = casper.cli.options['password']
else
missingArgumentError 'password'
# Board root and member list URLs, derived from the board name.
proboardUrl = "http://#{ proboardName }.proboards.com/"
proboardUserUrl = "#{ proboardUrl }members"
casper.userAgent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0'
# Open the board first, then log in by POSTing the credentials to the
# central login endpoint. Both steps have empty callbacks.
casper.start proboardUrl, ->
casper.thenOpen 'https://login.proboards.com/forum_submit/login',
method: 'post'
data:
forum: proboardNr
email: user
password: password
continue: 'Continue'
, ->
# Schedules the casper steps that scrape one board and stores the results on
# the `board` object itself: `board.boards` (sub-boards, scraped
# recursively), `board.threads`, and for every thread its `posts` and
# optional `poll` image name.
#
# board - object with at least `link` and `title` (as produced by findBoards).
readBoard = (board) ->
casper.thenOpen board.link, ->
board.boards = @evaluate findBoards
@each board.boards, (casper, subboard) ->
readBoard subboard
# Open the board again for the thread listing.
@thenOpen board.link, ->
@echo "getting threads for board '#{ board.title }'..."
board.threads = []
boardPages = @evaluate findPages
@each boardPages, (casper, boardPage) ->
@thenOpen boardPage, ->
board.threads = board.threads.concat @evaluate findThreads
# Runs as a later step, when board.threads has been filled by the page steps.
@then ->
@each board.threads, (casper, thread) ->
thread.posts = []
@thenOpen thread.link, ->
@echo "\tgetting posts for thread '#{ thread.title }'..."
pollName = null
# A poll is saved as a screenshot named after the last URL segment.
if @exists 'div.poll.show.ui-poll'
console.log '\t\tsaving poll...'
linkParts = thread.link.split '/'
pollName = "#{linkParts[linkParts.length - 1]}.png"
@captureSelector "data/images/polls/#{pollName}", 'div.poll.show.ui-poll'
thread.poll = pollName
threadPages = @evaluate findPages
@each threadPages, (casper, threadPage) ->
@thenOpen threadPage, ->
posts = @evaluate findPosts, replaceHtml
@each posts, (casper, post) ->
# Download embedded images and attachments, then point their URLs at
# the {{baseurl}} placeholder.
post.message = loadImages post.message
for attachment in post.attachments
casper.download attachment.url, "data/attachments/#{attachment.name}"
attachment.url = "{{baseurl}}/attachments/#{attachment.name}"
thread.posts = thread.posts.concat posts
# Once all pages are read, prepend the poll screenshot to the first post.
@then ->
if thread.poll and thread.posts[0]
thread.posts[0].message = "[img]{{baseurl}}/images/polls/#{thread.poll}[/img]\n\n#{thread.posts[0].message}"
if thread.poll and not thread.posts[0]
console.log "how the fuck did you manage that?"
# Result object; it is filled by the steps below and written out as JSON.
proboard = {}
# Step 1: scrape every top-level board (and, through readBoard, everything
# below it).
casper.thenOpen proboardUrl, ->
proboard.boards = @evaluate findBoards
@each proboard.boards, (casper, board) ->
readBoard board
# Step 2: walk the member list pages and scrape every linked profile.
casper.thenOpen proboardUserUrl, ->
proboard.users = []
userPages = @evaluate findPages
@each userPages, (casper, userPage) ->
@thenOpen userPage, ->
userlinks = @evaluate findUserLinks
@each userlinks, (casper, userlink) ->
@thenOpen userlink, ->
@echo "getting userinfo for '#{ userlink }'..."
# NOTE(review): `user` is also the file-level variable holding the login
# name; CoffeeScript does not shadow, so this assignment overwrites it —
# confirm that nothing needs the login name after this point.
user = @evaluate getUser, replaceHtml
user.signature = loadImages user.signature
proboard.users = proboard.users.concat user
# Step 3: write the collected data as tab-indented JSON.
casper.then ->
json = JSON.stringify(proboard, null, '\t')
fs.write "data/#{ proboardName }.json", json, 'w'
casper.run()

430
src/proboard_saver.js Normal file
View File

@ -0,0 +1,430 @@
// Generated by CoffeeScript 1.8.0
(function() {
var casper, findBoards, findPages, findPosts, findThreads, findUserLinks, fs, getUser, loadImages, missingArgumentError, password, proboard, proboardName, proboardNr, proboardUrl, proboardUserUrl, readBoard, replaceHtml, user, utils;
// Create the casper instance used by every step below. Web security is
// switched off in the page settings (presumably so cross-origin images and
// attachments can be fetched — confirm).
casper = require('casper').create({
verbose: false,
logLevel: 'info',
pageSettings: {
webSecurityEnabled: false
}
});
utils = require('utils');
fs = require('fs');
// Echo script-side errors with the "ERROR" style.
casper.on('error', function(msg, trace) {
return this.echo("Error: " + msg, "ERROR");
});
// Echo errors raised inside the loaded page with the "ERROR" style.
casper.on('page.error', function(msg, trace) {
return this.echo("Error: " + msg, "ERROR");
});
// Forward console output from the page, except messages matching
// "Unsafe JavaScript attempt", which are dropped.
casper.on('remote.message', function(msg, trace) {
if (!/Unsafe JavaScript attempt/.test(msg)) {
return this.echo("remote log: " + msg, "INFO");
}
});
/**
 * Downloads every image referenced as `[img]URL[/img]` in `searchString` to
 * `data/images/<last path segment of URL>` and rewrites the tag to point at
 * `{{baseurl}}/images/<name>`.
 *
 * The rewrite is done in a single pass with a replacer function instead of
 * building a RegExp from the URL: URLs routinely contain regex
 * metacharacters (`?`, `(`, `+`, ...) which previously made the replacement
 * silently miss or throw. Returning the replacement from a function also
 * keeps `$` sequences in file names from being interpreted.
 *
 * @param {string} searchString - BBCode text.
 * @returns {string} the text with all plain `[img]` tags rewritten.
 */
loadImages = function(searchString) {
  return searchString.replace(/\[img\](.*?)\[\/img\]/g, function(match, imageUrl) {
    var imageParts = imageUrl.split('/');
    var imageName = imageParts[imageParts.length - 1];
    console.log("\t\tdownloading image '" + imageName + "'...");
    casper.download(imageUrl, "data/images/" + imageName);
    return "[img]{{baseurl}}/images/" + imageName + "[/img]";
  });
};
/**
 * Converts the HTML inside `element` to BBCode-like plain text.
 *
 * The element's innerHTML is rewritten in place, one tag family at a time;
 * the final text is read from element.innerText and post-processed. Each
 * `for` loop replaces only the FIRST remaining match per iteration, so it
 * relies on the collected list being in the same order as the tags appear
 * in innerHTML.
 *
 * Compared with the previous build of this file, the text is now also cut
 * at the "Attachments:" heading, matching src/proboard_saver.coffee;
 * attachments are collected separately by findPosts.
 *
 * @param {Element} element - DOM node to convert (mutated by this call).
 * @returns {string} the converted text.
 */
replaceHtml = function(element) {
  var attachmentIndex, bold, boldElements, colour, colourElements, cursive, cursiveElements, dummyContent, dummyElements, dummySpan, finalText, image, images, link, linkElements, message, quote, quoteAvatar, quoteClear, quoteHeader, quoteHeaderNode, registeredUserNode, underlined, underlinedElements, user, video, videos, _i, _j, _k, _l, _len, _len1, _len2, _len3, _len4, _len5, _len6, _len7, _m, _n, _o, _p;
  // <img> -> [img=alt]src[/img] (the "=alt" part only when alt is non-empty)
  images = Array.prototype.map.call(element.querySelectorAll('img'), function(img) {
    return {
      src: img.src,
      alt: img.hasAttribute('alt') ? img.alt : ''
    };
  });
  for (_i = 0, _len = images.length; _i < _len; _i++) {
    image = images[_i];
    element.innerHTML = element.innerHTML.replace(/<img[^>]*>/, "[img" + (image.alt ? '=' + image.alt : '') + "]" + image.src + "[/img]");
  }
  // YouTube iframes -> [video]watch-url[/video]; the id is the 5th
  // '/'-separated piece of the src with any query string cut off.
  videos = Array.prototype.map.call(element.querySelectorAll("iframe[title='YouTube video player']"), function(video) {
    return video.src.split('/')[4].split('?')[0];
  });
  for (_j = 0, _len1 = videos.length; _j < _len1; _j++) {
    video = videos[_j];
    element.innerHTML = element.innerHTML.replace(/<iframe title="YouTube video player"[^>]*>.*?<\/iframe>/, "[video]https://www.youtube.com/watch?v=" + video + "[/video]");
  }
  // <i>, <b>, <u> -> [i], [b], [u]
  cursiveElements = Array.prototype.map.call(element.querySelectorAll('i'), function(cursive) {
    return cursive.innerHTML;
  });
  for (_k = 0, _len2 = cursiveElements.length; _k < _len2; _k++) {
    cursive = cursiveElements[_k];
    element.innerHTML = element.innerHTML.replace(/<i>.*?<\/i>/, "[i]" + cursive + "[/i]");
  }
  boldElements = Array.prototype.map.call(element.querySelectorAll('b'), function(bold) {
    return bold.innerHTML;
  });
  for (_l = 0, _len3 = boldElements.length; _l < _len3; _l++) {
    bold = boldElements[_l];
    element.innerHTML = element.innerHTML.replace(/<b>.*?<\/b>/, "[b]" + bold + "[/b]");
  }
  underlinedElements = Array.prototype.map.call(element.querySelectorAll('u'), function(underlined) {
    return underlined.innerHTML;
  });
  for (_m = 0, _len4 = underlinedElements.length; _m < _len4; _m++) {
    underlined = underlinedElements[_m];
    element.innerHTML = element.innerHTML.replace(/<u>.*?<\/u>/, "[u]" + underlined + "[/u]");
  }
  // <font color="x"> -> [colour=x]
  // NOTE(review): `[^<\/font>]*` is a character class (excludes the
  // characters < / f o n t >), not "anything up to </font>", and `".*"` is
  // greedy — coloured text containing those letters will likely not match.
  colourElements = Array.prototype.map.call(element.querySelectorAll('font[color]'), function(colour) {
    return {
      name: colour.attributes['color'].value.toLowerCase(),
      innerHTML: colour.innerHTML
    };
  });
  for (_n = 0, _len5 = colourElements.length; _n < _len5; _n++) {
    colour = colourElements[_n];
    element.innerHTML = element.innerHTML.replace(/<font color=".*">[^<\/font>]*<\/font>/, "[colour=" + colour.name + "]" + colour.innerHTML + "[/colour]");
  }
  // Quotes: repeatedly take the first remaining div.quote_body, work out the
  // quoted user, strip header/avatar/clear helpers, and replace the quote's
  // parent node with a placeholder span holding "[quote=user]...[/quote]".
  quote = element.querySelector('div.quote_body');
  while (quote) {
    quoteHeaderNode = quote.querySelector('div.quote_header');
    registeredUserNode = quoteHeaderNode ? quote.querySelector('div.quote_header').querySelector('span[itemprop="name"]') : null;
    user = null;
    if (registeredUserNode) {
      user = registeredUserNode.textContent;
    } else if (quote.parentNode.attributes['author']) {
      // Fall back to the parent's "author" attribute, minus a leading '@'.
      user = quote.parentNode.attributes['author'].value;
      if (user.substr(0, 1) === '@') {
        user = user.substr(1);
      }
    }
    quoteHeader = quote.querySelector('div.quote_header');
    if (quoteHeader) {
      quoteHeader.parentNode.removeChild(quoteHeader);
    }
    quoteAvatar = quote.querySelector('div.quote_avatar_container');
    if (quoteAvatar) {
      quoteAvatar.parentNode.removeChild(quoteAvatar);
    }
    quoteClear = quote.querySelector('div.quote_clear');
    if (quoteClear) {
      quoteClear.parentNode.removeChild(quoteClear);
    }
    message = quote.innerHTML;
    dummySpan = document.createElement('span');
    dummySpan.setAttribute('class', 'dummytag');
    dummySpan.innerHTML = "[quote" + (user ? '=' + user : '') + "]" + message + "[/quote]";
    quote.parentNode.parentNode.replaceChild(dummySpan, quote.parentNode);
    quote = element.querySelector('div.quote_body');
  }
  // Unwrap the placeholder spans, leaving only their text.
  dummyElements = Array.prototype.map.call(element.querySelectorAll('span.dummytag'), function(dummy) {
    return dummy.innerHTML;
  });
  for (_o = 0, _len6 = dummyElements.length; _o < _len6; _o++) {
    dummyContent = dummyElements[_o];
    element.innerHTML = element.innerHTML.replace(/<span class="dummytag">.*<\/span>/, dummyContent);
  }
  // <a href> -> [url=href]text[/url]
  linkElements = Array.prototype.map.call(element.querySelectorAll('a[href]'), function(link) {
    return {
      target: link.attributes['href'].value,
      name: link.innerText
    };
  });
  for (_p = 0, _len7 = linkElements.length; _p < _len7; _p++) {
    link = linkElements[_p];
    element.innerHTML = element.innerHTML.replace(/<a[^>]*>.*?<\/a>/, "[url=" + link.target + "]" + link.name + "[/url]");
  }
  // Drop whatever <font> tags and quote_clear divs are left.
  element.innerHTML = element.innerHTML.replace(/<font [^>]*>/g, '');
  element.innerHTML = element.innerHTML.replace(/<\/font>/g, '');
  element.innerHTML = element.innerHTML.replace(/<div class="quote_clear"><\/div>/g, '');
  finalText = element.innerText;
  finalText = finalText.replace(/<br>/g, '\n');
  // Smiley images hosted under the two proboards smiley paths are reduced
  // to their alt text (the part after "[img=").
  finalText = finalText.replace(/\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1');
  finalText = finalText.replace(/\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1');
  // Cut everything from the "Attachments:" heading onwards.
  attachmentIndex = finalText.indexOf('\n\n[b]Attachments:[/b]\n\n');
  if (attachmentIndex > -1) {
    finalText = finalText.substring(0, attachmentIndex);
  }
  return finalText;
};
/**
 * Lists the boards shown on the current page.
 *
 * Board anchors and description paragraphs are queried separately and
 * paired by position; a board without a matching description gets
 * `undefined`.
 *
 * @returns {Array<{title: string, description: (string|undefined), link: string}>}
 */
findBoards = function() {
  var boardAnchors = document.querySelectorAll('tr.board.item td:nth-child(2) > span > a');
  var descriptionNodes = document.querySelectorAll('tr.board.item td:nth-child(2) > p.description');
  return Array.prototype.map.call(boardAnchors, function(anchor, index) {
    var descriptionNode = descriptionNodes[index];
    return {
      title: anchor.textContent,
      description: descriptionNode ? descriptionNode.textContent : undefined,
      link: anchor.href
    };
  });
};
/**
 * Builds the list of page URLs for the paginated view currently loaded.
 *
 * The highest page number is read from the last pagination link
 * (`...?page=N`) and URLs for pages 1..N are generated from its prefix.
 * When the page has no pagination links, or the last link carries no usable
 * page number, the current document URL is returned as the only page
 * instead of throwing on a null match (which previously lost the whole
 * board/thread).
 *
 * @returns {string[]} page URLs in ascending page order.
 */
findPages = function() {
  var shownPages = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]');
  var lastPage = shownPages[shownPages.length - 1];
  var pageInfo = lastPage ? /(.*\?page=)(\d*)/.exec(lastPage.href) : null;
  // Parse explicitly: the capture is a string and may be empty.
  var maxPage = pageInfo ? parseInt(pageInfo[2], 10) : NaN;
  if (isNaN(maxPage) || maxPage < 1) {
    return [document.location.href];
  }
  var pageBase = pageInfo[1];
  var pages = [];
  for (var pageNr = 1; pageNr <= maxPage; pageNr++) {
    pages.push("" + pageBase + pageNr);
  }
  return pages;
};
/**
 * Lists the threads shown on the current board page.
 *
 * The thread id is the run of digits following "/thread/" in the link.
 *
 * @returns {Array<{id: string, title: string, link: string}>}
 */
findThreads = function() {
  var threadAnchors = document.querySelectorAll('tr.item.thread > td:nth-child(3) a.thread-link');
  return Array.prototype.map.call(threadAnchors, function(anchor) {
    var idMatch = /.*\/thread\/(\d*)\/.*/.exec(anchor.href);
    return {
      id: idMatch[1],
      title: anchor.textContent,
      link: anchor.href
    };
  });
};
/**
 * Extracts every post on the current thread page.
 *
 * @param {function(Element): string} replaceHtml - converts a message node
 *   to text; it is passed in as an argument rather than referenced from the
 *   outer scope.
 * @returns {Array<{id: string, message: string, attachments: Array, timestamp: number, user: Object}>}
 */
findPosts = function(replaceHtml) {
  // Attachment name: alt text of the first child element when the link
  // wraps one, otherwise the link text.
  var describeAttachment = function(anchor) {
    var wrapsElement = anchor.childElementCount > 0;
    return {
      name: wrapsElement ? anchor.children[0].alt : anchor.text,
      url: anchor.href
    };
  };

  // Nodes with an href are user links (the last URL segment is kept as
  // `link`); otherwise the name is the node's first text child.
  var describeAuthor = function(userNode) {
    if (userNode.href) {
      var segments = userNode.href.split('/');
      return {
        link: segments[segments.length - 1],
        name: userNode.textContent
      };
    }
    return {
      link: '',
      name: userNode.firstChild.data.replace('\n\t', '')
    };
  };

  return Array.prototype.map.call(document.querySelectorAll('tr.item.post'), function(row) {
    var messageNode = row.querySelector('td.content div.message');
    // Collect the attachment links BEFORE replaceHtml rewrites the message.
    var attachmentAnchors = messageNode.querySelectorAll('div.post_attachments blockquote a');
    var dateNode = row.querySelector('td.content span.date > abbr.time');
    var userNode = row.querySelector('td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile');
    var postId = /post-(\d*)/.exec(row.id)[1];
    var message = replaceHtml(messageNode);
    return {
      id: postId,
      message: message,
      attachments: Array.prototype.map.call(attachmentAnchors, describeAttachment),
      // data-timestamp is divided by 1000 (presumably milliseconds to seconds).
      timestamp: parseInt(dateNode.attributes['data-timestamp'].value, 10) / 1000,
      user: describeAuthor(userNode)
    };
  });
};
/**
 * Returns the href of every user link inside the members container of the
 * current page.
 *
 * @returns {string[]}
 */
findUserLinks = function() {
  var memberAnchors = document.querySelectorAll('div.container.members a.user-link');
  var hrefs = [];
  for (var i = 0; i < memberAnchors.length; i++) {
    hrefs.push(memberAnchors[i].href);
  }
  return hrefs;
};
/**
 * Reads name, signature, status and registration time from the profile page
 * currently loaded.
 *
 * @param {function(Element): string} replaceHtml - converts the signature
 *   node to text.
 * @returns {{name: string, signature: string, status: string, registered: number}}
 */
getUser = function(replaceHtml) {
  var displayName = document.querySelectorAll('span.big_username')[0].textContent;

  // The last content box only counts as a signature when its converted text
  // contains the "Signature" heading line; that heading is then removed.
  var signature = '';
  var signatureBox = document.querySelector('td#center-column > div.content-box:last-child');
  if (signatureBox) {
    var converted = replaceHtml(signatureBox);
    if (/Signature\n/.test(converted)) {
      signature = converted.replace('Signature\n', '');
    }
  }

  var statusNodes = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text');
  var status = statusNodes.length > 0 ? statusNodes[0].textContent : '';

  var registeredNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0];
  var registered = parseInt(registeredNode.attributes['data-timestamp'].value, 10) / 1000;

  return {
    name: displayName,
    signature: signature,
    status: status,
    registered: registered
  };
};
/**
 * Reports a missing command line option and asks casper to exit.
 *
 * Exits with status 1 so callers such as the Grunt `run` task can tell a
 * usage error apart from a successful run (previously the status was 0).
 *
 * @param {string} argument - name of the missing option.
 * @returns {*} whatever `casper.exit` returns.
 */
missingArgumentError = function(argument) {
  console.log("missing the " + argument + " argument");
  return casper.exit(1);
};
// Read the four required command line options. Each check is a truthiness
// test, so an option that is absent (or given a falsy value) is reported
// through missingArgumentError.
if (casper.cli.options['board-nr']) {
proboardNr = casper.cli.options['board-nr'];
} else {
missingArgumentError('board-nr');
}
if (casper.cli.options['board-name']) {
proboardName = casper.cli.options['board-name'];
} else {
missingArgumentError('board-name');
}
if (casper.cli.options['user']) {
user = casper.cli.options['user'];
} else {
missingArgumentError('user');
}
if (casper.cli.options['password']) {
password = casper.cli.options['password'];
} else {
missingArgumentError('password');
}
// Board root and member list URLs, derived from the board name.
proboardUrl = "http://" + proboardName + ".proboards.com/";
proboardUserUrl = "" + proboardUrl + "members";
casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0');
// Open the board first, then log in by POSTing the credentials to the
// central login endpoint. Both steps have empty callbacks.
casper.start(proboardUrl, function() {});
casper.thenOpen('https://login.proboards.com/forum_submit/login', {
method: 'post',
data: {
forum: proboardNr,
email: user,
password: password,
"continue": 'Continue'
}
}, function() {});
/**
 * Schedules the casper steps that scrape one board and stores the results
 * on the `board` object itself: `board.boards` (sub-boards, scraped
 * recursively), `board.threads`, and for every thread its `posts` and
 * optional `poll` image name.
 *
 * Debug leftovers from the previous build were removed so the behaviour
 * matches src/proboard_saver.coffee again: `board.threads` is no longer
 * overwritten with its first element, `loadImages` receives `post.message`
 * (a string) rather than `post.message.message` (undefined, which threw),
 * and the `utils.dump` calls are gone.
 *
 * @param {{link: string, title: string}} board - board descriptor as
 *   produced by findBoards; mutated in place.
 * @returns {*} whatever `casper.thenOpen` returns.
 */
readBoard = function(board) {
  return casper.thenOpen(board.link, function() {
    board.boards = this.evaluate(findBoards);
    this.each(board.boards, function(casper, subboard) {
      return readBoard(subboard);
    });
    // Open the board again for the thread listing.
    return this.thenOpen(board.link, function() {
      var boardPages;
      this.echo("getting threads for board '" + board.title + "'...");
      board.threads = [];
      boardPages = this.evaluate(findPages);
      this.each(boardPages, function(casper, boardPage) {
        return this.thenOpen(boardPage, function() {
          return board.threads = board.threads.concat(this.evaluate(findThreads));
        });
      });
      // Runs as a later step, when board.threads has been filled by the
      // page steps scheduled above.
      return this.then(function() {
        return this.each(board.threads, function(casper, thread) {
          thread.posts = [];
          return this.thenOpen(thread.link, function() {
            var linkParts, pollName, threadPages;
            this.echo("\tgetting posts for thread '" + thread.title + "'...");
            pollName = null;
            // A poll is saved as a screenshot named after the last URL segment.
            if (this.exists('div.poll.show.ui-poll')) {
              console.log('\t\tsaving poll...');
              linkParts = thread.link.split('/');
              pollName = "" + linkParts[linkParts.length - 1] + ".png";
              this.captureSelector("data/images/polls/" + pollName, 'div.poll.show.ui-poll');
            }
            thread.poll = pollName;
            threadPages = this.evaluate(findPages);
            this.each(threadPages, function(casper, threadPage) {
              return this.thenOpen(threadPage, function() {
                var posts;
                posts = this.evaluate(findPosts, replaceHtml);
                this.each(posts, function(casper, post) {
                  var attachment, _i, _len, _ref;
                  // Download embedded images and attachments, then point
                  // their URLs at the {{baseurl}} placeholder.
                  post.message = loadImages(post.message);
                  _ref = post.attachments;
                  for (_i = 0, _len = _ref.length; _i < _len; _i++) {
                    attachment = _ref[_i];
                    casper.download(attachment.url, "data/attachments/" + attachment.name);
                    attachment.url = "{{baseurl}}/attachments/" + attachment.name;
                  }
                });
                return thread.posts = thread.posts.concat(posts);
              });
            });
            // Once all pages are read, prepend the poll screenshot to the
            // first post.
            return this.then(function() {
              if (thread.poll && thread.posts[0]) {
                thread.posts[0].message = "[img]{{baseurl}}/images/polls/" + thread.poll + "[/img]\n\n" + thread.posts[0].message;
              }
              if (thread.poll && !thread.posts[0]) {
                return console.log("how the fuck did you manage that?");
              }
            });
          });
        });
      });
    });
  });
};
// Result object; it is filled by the steps below and written out as JSON.
proboard = {};
// Step 1: scrape every top-level board (and, through readBoard, everything
// below it). The previous build only read the first board; this matches
// src/proboard_saver.coffee again.
casper.thenOpen(proboardUrl, function() {
  proboard.boards = this.evaluate(findBoards);
  return this.each(proboard.boards, function(casper, board) {
    return readBoard(board);
  });
});
// Step 2: walk the member list pages and scrape every linked profile, as
// the CoffeeScript source does (this step was missing from the previous
// build).
casper.thenOpen(proboardUserUrl, function() {
  var userPages;
  proboard.users = [];
  userPages = this.evaluate(findPages);
  return this.each(userPages, function(casper, userPage) {
    return this.thenOpen(userPage, function() {
      var userlinks;
      userlinks = this.evaluate(findUserLinks);
      return this.each(userlinks, function(casper, userlink) {
        return this.thenOpen(userlink, function() {
          // Local name on purpose: the outer `user` holds the login name.
          var profile;
          this.echo("getting userinfo for '" + userlink + "'...");
          profile = this.evaluate(getUser, replaceHtml);
          profile.signature = loadImages(profile.signature);
          return proboard.users = proboard.users.concat(profile);
        });
      });
    });
  });
});
// Step 3: write the collected data as tab-indented JSON.
casper.then(function() {
  var json;
  json = JSON.stringify(proboard, null, '\t');
  return fs.write("data/" + proboardName + ".json", json, 'w');
});
casper.run();
}).call(this);