initial commit
This commit is contained in:
commit
fc83ea25a9
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
build/
|
||||||
|
node_modules/
|
||||||
|
.DS_Store
|
23
Gruntfile.js
Normal file
23
Gruntfile.js
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
module.exports = function(grunt) {
|
||||||
|
grunt.initConfig({
|
||||||
|
pkg: grunt.file.readJSON( 'package.json' ),
|
||||||
|
|
||||||
|
run: {
|
||||||
|
saver: {
|
||||||
|
cmd: 'casperjs',
|
||||||
|
args: [
|
||||||
|
'src/proboard_saver.coffee',
|
||||||
|
'--board-nr=YOUR-BOARD-NR',
|
||||||
|
'--board-name=YOUR-BOARD-NAME',
|
||||||
|
'--user=YOUR-USERNAME',
|
||||||
|
'--password=YOUR-PASSWORD'
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
grunt.loadNpmTasks('grunt-run');
|
||||||
|
|
||||||
|
grunt.registerTask('default', ['run:saver']);
|
||||||
|
grunt.registerTask('save', ['run:saver']);
|
||||||
|
};
|
13
README.md
Normal file
13
README.md
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
This tool tries to scrape all the accessible data from a [Proboards Forum](https://proboards.com/).
|
||||||
|
|
||||||
|
I wrote this because I wanted to export at least the text data for an old
|
||||||
|
board hosted there which some friends and I were using years ago. As it turned
|
||||||
|
out you simply can't do that. So here we are.
|
||||||
|
|
||||||
|
The program probably does some horrible things and I can't say if it will work
|
||||||
|
for every theme. But hey, it only has to work one time to get at the data.
|
||||||
|
|
||||||
|
It is able to detect attachments and images and tries to download them too.
|
||||||
|
|
||||||
|
A working [casperjs](http://casperjs.org/) installation is needed for the stuff to work.
|
||||||
|
|
15
package.json
Normal file
15
package.json
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
{
|
||||||
|
"name": "proboard_saver",
|
||||||
|
"version": "0.0.1",
|
||||||
|
"description": "",
|
||||||
|
"main": "src/proboard_saver.coffee",
|
||||||
|
"dependencies": {
|
||||||
|
"grunt": "~0.4.2",
|
||||||
|
"grunt-run": "~0.2.1"
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"test": "echo \"Error: no test specified\" && exit 1"
|
||||||
|
},
|
||||||
|
"author": "Sebastian Hugentobler",
|
||||||
|
"license": "CC0"
|
||||||
|
}
|
377
src/proboard_saver.coffee
Normal file
377
src/proboard_saver.coffee
Normal file
@ -0,0 +1,377 @@
|
|||||||
|
# 2015 by Sebastian Hugentobler <shugentobler@vanwa.ch>
|
||||||
|
# To the extent possible under law, the author(s) have dedicated all copyright
|
||||||
|
# and related and neighboring rights to this software to the public domain
|
||||||
|
# worldwide. This software is distributed without any warranty.
|
||||||
|
# See http://creativecommons.org/publicdomain/zero/1.0/ for a description of CC0.
|
||||||
|
|
||||||
|
# The scraper instance. Web security is switched off in the page settings.
casper = require('casper').create
  verbose: false
  logLevel: 'info'
  pageSettings:
    webSecurityEnabled: false

utils = require 'utils'
fs = require 'fs'

# Shared handler: script errors and in-page errors are echoed the same way.
reportError = (msg) ->
  @echo "Error: #{msg}", "ERROR"

casper.on 'error', reportError
casper.on 'page.error', reportError

# Forward console output of the scraped page, except for the
# "Unsafe JavaScript attempt" messages, which are filtered out.
casper.on 'remote.message', (msg) ->
  @echo "remote log: #{msg}", "INFO" unless /Unsafe JavaScript attempt/.test msg
|
||||||
|
|
||||||
|
# Downloads every image referenced by an `[img]url[/img]` tag in `searchString`
# to `data/images/<last path segment of the url>` and rewrites the tag so that
# it points at `{{baseurl}}/images/<name>` instead.
#
# Only tags without an `=alt` part are matched by the pattern below.
#
# searchString - text containing BBCode-style image tags
#
# Returns the text with the matched image tags rewritten.
loadImages = (searchString) ->
  tags = searchString.match /\[img\](.*?)\[\/img\]/g
  return searchString unless tags

  for tag in tags
    url = /\[img\](.*?)\[\/img\]/.exec(tag)[1]
    imageName = url.split('/').pop()

    console.log "\t\tdownloading image '#{imageName}'..."
    casper.download url, "data/images/#{imageName}"

    localTag = "[img]{{baseurl}}/images/#{imageName}[/img]"

    # Replace the literal tag text. Building a RegExp from the url would treat
    # characters such as `?`, `+` or `(` as regex syntax and miss the tag. The
    # function form keeps `$` sequences in the name from being expanded.
    searchString = searchString.replace tag, -> localTag

  return searchString
|
||||||
|
|
||||||
|
# Converts the rendered markup below `element` into BBCode-like plain text.
#
# This function is handed to `casper.evaluate` and therefore runs inside the
# scraped page: it must stay self-contained and may only use page globals.
#
# Conversions, in order: images, YouTube iframes, italic/bold/underline,
# font colours, quotes and links. Smiley images are reduced to their alt text
# and everything from the "Attachments:" heading on is cut off.
#
# The markup is edited through the DOM rather than by running regular
# expressions over the serialised HTML, so tags with attributes, content
# spanning several lines and several quotes in one post are handled.
#
# element - DOM element holding the message or signature; it is modified
#
# Returns the converted text as a string.
replaceHtml = (element) ->
  # Re-parse once so all nodes below are fresh copies. Nodes the caller
  # looked up beforehand (e.g. attachment links) stay exactly as they were.
  element.innerHTML = element.innerHTML

  # Calls `action` for every node currently matching `selector`.
  eachMatch = (selector, action) ->
    Array::forEach.call element.querySelectorAll(selector), (node) ->
      action node
      return
    return

  # Replaces `node` by a plain text node containing `text`.
  swapForText = (node, text) ->
    node.parentNode.replaceChild document.createTextNode(text), node
    return

  # Puts `opening` and `closing` as text around the children of `node`.
  surround = (node, opening, closing) ->
    node.insertBefore document.createTextNode(opening), node.firstChild
    node.appendChild document.createTextNode(closing)
    return

  eachMatch 'img', (img) ->
    alt = if img.hasAttribute('alt') then img.alt else ''
    swapForText img, "[img#{if alt then '=' + alt else ''}]#{img.src}[/img]"

  eachMatch "iframe[title='YouTube video player']", (video) ->
    videoId = video.src.split('/')[4].split('?')[0]
    swapForText video, "[video]https://www.youtube.com/watch?v=#{videoId}[/video]"

  # The tags themselves can stay: only the text of the element is read at the
  # end, so putting the markers inside them is sufficient.
  for tag in ['i', 'b', 'u']
    eachMatch tag, (node) ->
      surround node, "[#{tag}]", "[/#{tag}]"

  eachMatch 'font[color]', (font) ->
    surround font, "[colour=#{font.attributes['color'].value.toLowerCase()}]", '[/colour]'

  # Quotes: the outermost quote is found first; nested quotes move along with
  # its content and are picked up by the following iterations.
  while quote = element.querySelector('div.quote_body')
    container = quote.parentNode
    header = quote.querySelector 'div.quote_header'
    nameNode = if header then header.querySelector('span[itemprop="name"]') else null

    author = null
    if nameNode
      author = nameNode.textContent
    else if container.attributes['author']
      author = container.attributes['author'].value
      if author.substr(0, 1) == '@'
        author = author.substr 1

    # Drop the decoration so that only the quoted message remains.
    for selector in ['div.quote_header', 'div.quote_avatar_container', 'div.quote_clear']
      decoration = quote.querySelector selector
      if decoration then decoration.parentNode.removeChild decoration

    fragment = document.createDocumentFragment()
    fragment.appendChild document.createTextNode("[quote#{if author then '=' + author else ''}]")
    while quote.firstChild
      fragment.appendChild quote.firstChild
    fragment.appendChild document.createTextNode('[/quote]')

    container.parentNode.replaceChild fragment, container

  eachMatch 'a[href]', (link) ->
    swapForText link, "[url=#{link.attributes['href'].value}]#{link.innerText}[/url]"

  element.innerHTML = element.innerHTML.replace /<font [^>]*>/g, ''
  element.innerHTML = element.innerHTML.replace /<\/font>/g, ''
  element.innerHTML = element.innerHTML.replace /<div class="quote_clear"><\/div>/g, ''

  finalText = element.innerText
  finalText = finalText.replace /<br>/g, '\n'

  # Smileys are images served from the Proboards smiley folders; keep only
  # their alt text.
  finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1'
  finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1'

  attachmentIndex = finalText.indexOf('\n\n[b]Attachments:[/b]\n\n')
  if attachmentIndex > -1
    finalText = finalText.substring 0, attachmentIndex

  return finalText
|
||||||
|
|
||||||
|
# Collects the boards listed on the current page. Runs inside the scraped
# page (it is passed to `casper.evaluate`), so it must stay self-contained.
#
# NOTE(review): titles and descriptions come from two separate queries and
# are paired by position - this presumes every listed board has a description.
#
# Returns an array of { title, description, link } objects.
findBoards = ->
  anchors = document.querySelectorAll('tr.board.item td:nth-child(2) > span > a')
  descriptions = document.querySelectorAll('tr.board.item td:nth-child(2) > p.description')

  boardInfo = Array::map.call anchors, (anchor, position) ->
    title: anchor.textContent
    description: descriptions[position]?.textContent
    link: anchor.href

  return boardInfo
|
||||||
|
|
||||||
|
# Builds the list of page urls for the paginated view that is currently open.
# Runs inside the scraped page (it is passed to `casper.evaluate`), so it must
# stay self-contained.
#
# The last pagination link supplies both the url prefix (up to `?page=`) and
# the highest page number. If no such link is present, the view is treated as
# a single page and only the current url is returned.
#
# Returns an array of url strings, one per page, starting at page 1.
findPages = ->
  shownPages = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]')
  lastPage = shownPages[shownPages.length - 1]

  pageInfo = if lastPage then /(.*\?page=)(\d+)/.exec(lastPage.href) else null
  return [document.location.href] unless pageInfo

  pageBase = pageInfo[1]
  # Parse to a number and never go below 1, so the range cannot count down.
  maxPage = Math.max 1, parseInt(pageInfo[2], 10)

  return ("#{pageBase}#{pageNr}" for pageNr in [1..maxPage])
|
||||||
|
|
||||||
|
# Collects the threads listed on the current board page. Runs inside the
# scraped page (it is passed to `casper.evaluate`), so it must stay
# self-contained.
#
# The thread id is the number following `/thread/` in the thread link.
#
# Returns an array of { id, title, link } objects.
findThreads = ->
  anchors = document.querySelectorAll('tr.item.thread > td:nth-child(3) a.thread-link')

  threadInfo = Array::map.call anchors, (anchor) ->
    id: /.*\/thread\/(\d*)\/.*/.exec(anchor.href)[1]
    title: anchor.textContent
    link: anchor.href

  return threadInfo
|
||||||
|
|
||||||
|
# Collects all posts shown on the current thread page. Runs inside the scraped
# page (it is passed to `casper.evaluate`), so it must stay self-contained.
#
# replaceHtml - function that converts a message element into text; it is
#               passed in because page functions cannot see the script scope
#
# Returns an array of { id, message, attachments, timestamp, user } objects,
# where attachments is a list of { name, url } and user is { link, name }.
findPosts = (replaceHtml) ->
  # Turns one post row into a plain record.
  readPost = (row) ->
    body = row.querySelector('td.content div.message')
    # Looked up before the message is converted below.
    attachmentLinks = body.querySelectorAll('div.post_attachments blockquote a')
    stamp = row.querySelector('td.content span.date > abbr.time')
    author = row.querySelector('td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile')

    postId = /post-(\d*)/.exec(row.id)[1]
    text = replaceHtml(body)

    # A link that wraps an element is named after that element's alt text,
    # any other link after its own text.
    files = Array::map.call attachmentLinks, (anchor) ->
      name: if anchor.childElementCount > 0 then anchor.children[0].alt else anchor.text
      url: anchor.href

    # data-timestamp is divided by 1000 (presumably milliseconds to seconds).
    seconds = parseInt(stamp.attributes['data-timestamp'].value, 10) / 1000

    if author.href
      # Registered user: the last url segment identifies the profile.
      segments = author.href.split '/'
      poster = { link: segments[segments.length - 1], name: author.textContent }
    else
      # Guest: the name is the first text node of the mini profile.
      poster = { link: '', name: author.firstChild.data.replace('\n\t', '') }

    return {
      id: postId
      message: text
      attachments: files
      timestamp: seconds
      user: poster
    }

  return Array::map.call document.querySelectorAll('tr.item.post'), (row) -> readPost row
|
||||||
|
|
||||||
|
# Collects the profile urls of all members listed on the current page. Runs
# inside the scraped page (it is passed to `casper.evaluate`).
#
# Returns an array of url strings.
findUserLinks = ->
  memberAnchors = document.querySelectorAll('div.container.members a.user-link')
  return (anchor.href for anchor in memberAnchors)
|
||||||
|
|
||||||
|
# Reads name, signature, status and registration time from the profile page
# that is currently open. Runs inside the scraped page (it is passed to
# `casper.evaluate`), so it must stay self-contained.
#
# replaceHtml - function that converts an element into text; it is passed in
#               because page functions cannot see the script scope
#
# Returns a { name, signature, status, registered } object.
getUser = (replaceHtml) ->
  name = document.querySelectorAll('span.big_username')[0].textContent

  # The last content box of the centre column is taken as the signature box;
  # it only counts if its text contains the "Signature" heading.
  signature = ''
  signatureNode = document.querySelector('td#center-column > div.content-box:last-child')
  if signatureNode
    signature = replaceHtml signatureNode
    signature = '' unless /Signature\n/.test signature
    signature = signature.replace 'Signature\n', ''

  statusNodes = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text')
  status = if statusNodes.length > 0 then statusNodes[0].textContent else ''

  # data-timestamp is divided by 1000 (presumably milliseconds to seconds).
  registeredNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0]
  registered = parseInt(registeredNode.attributes['data-timestamp'].value, 10) / 1000

  return {
    name: name
    signature: signature
    status: status
    registered: registered
  }
|
||||||
|
|
||||||
|
# Reports a missing mandatory command line option and ends the script.
#
# The script exits with status 1 so that callers (grunt, shell scripts) can
# tell a usage error from a successful run.
#
# argument - name of the missing option, without the leading dashes
missingArgumentError = (argument) ->
  console.log "missing the #{ argument } argument"
  casper.exit 1
|
||||||
|
|
||||||
|
# Looks up a mandatory command line option. A missing (or falsy) option is
# reported through missingArgumentError and yields undefined.
requiredOption = (name) ->
  value = casper.cli.options[name]
  return value if value
  missingArgumentError name
  undefined

proboardNr = requiredOption 'board-nr'
proboardName = requiredOption 'board-name'
user = requiredOption 'user'
password = requiredOption 'password'

# Entry points of the forum: the start page and the member list.
proboardUrl = "http://#{ proboardName }.proboards.com/"
proboardUserUrl = "#{ proboardUrl }members"

casper.userAgent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0'
|
||||||
|
|
||||||
|
|
||||||
|
# Form fields posted to the Proboards login endpoint.
loginRequest =
  method: 'post'
  data:
    forum: proboardNr
    email: user
    password: password
    continue: 'Continue'

# Open the forum first, then log in. Neither step needs a callback body.
casper.start proboardUrl, ->

casper.thenOpen 'https://login.proboards.com/forum_submit/login', loginRequest, ->
|
||||||
|
|
||||||
|
# Queues the Casper steps that read one board: its sub-boards (recursively),
# all thread pages, and for every thread all posts together with their images,
# attachments and - if present - a screenshot of the poll.
#
# The results are stored on the board object itself: `board.boards`,
# `board.threads`, and per thread `posts` and `poll`.
#
# Attachments are saved as `data/attachments/<post id>_<name>`, so files with
# the same name in different posts do not overwrite each other.
#
# board - object with at least `link` and `title`, as produced by findBoards
readBoard = (board) ->
  casper.thenOpen board.link, ->
    # evaluate yields null when the page function fails; fall back to an
    # empty list so one broken page does not abort the whole export.
    board.boards = (@evaluate findBoards) or []

    @each board.boards, (casper, subboard) ->
      readBoard subboard

    @thenOpen board.link, ->
      @echo "getting threads for board '#{ board.title }'..."

      board.threads = []

      boardPages = (@evaluate findPages) or []

      @each boardPages, (casper, boardPage) ->
        @thenOpen boardPage, ->
          board.threads = board.threads.concat((@evaluate findThreads) or [])

      # Runs after all board pages above have been visited.
      @then ->
        @each board.threads, (casper, thread) ->
          thread.posts = []

          @thenOpen thread.link, ->
            @echo "\tgetting posts for thread '#{ thread.title }'..."

            pollName = null
            if @exists 'div.poll.show.ui-poll'
              console.log '\t\tsaving poll...'

              # The poll screenshot is named after the last url segment.
              linkParts = thread.link.split '/'
              pollName = "#{linkParts[linkParts.length - 1]}.png"

              @captureSelector "data/images/polls/#{pollName}", 'div.poll.show.ui-poll'

            thread.poll = pollName

            threadPages = (@evaluate findPages) or []

            @each threadPages, (casper, threadPage) ->
              @thenOpen threadPage, ->
                posts = (@evaluate findPosts, replaceHtml) or []

                @each posts, (casper, post) ->
                  post.message = loadImages post.message

                  for attachment in post.attachments
                    # Prefix with the post id to keep file names unique.
                    fileName = "#{post.id}_#{attachment.name}"
                    casper.download attachment.url, "data/attachments/#{fileName}"
                    attachment.url = "{{baseurl}}/attachments/#{fileName}"
                  return

                thread.posts = thread.posts.concat posts

            # Runs after all thread pages above have been visited: the poll
            # image is put in front of the first post.
            @then ->
              if thread.poll and thread.posts[0]
                thread.posts[0].message = "[img]{{baseurl}}/images/polls/#{thread.poll}[/img]\n\n#{thread.posts[0].message}"

              if thread.poll and not thread.posts[0]
                console.log "\t\twarning: thread '#{ thread.title }' has a poll but no posts, the poll image is not referenced"
|
||||||
|
|
||||||
|
# Root of the exported data: `boards` (filled by readBoard) and `users`.
proboard = {}

# Step 1: read all boards reachable from the start page.
casper.thenOpen proboardUrl, ->
  # evaluate yields null when the page function fails; fall back to an empty
  # list so the remaining steps still run.
  proboard.boards = (@evaluate findBoards) or []

  @each proboard.boards, (casper, board) ->
    readBoard board

# Step 2: walk the member list and read every profile.
casper.thenOpen proboardUserUrl, ->
  proboard.users = []

  userPages = (@evaluate findPages) or []

  @each userPages, (casper, userPage) ->
    @thenOpen userPage, ->
      userlinks = (@evaluate findUserLinks) or []

      @each userlinks, (casper, userlink) ->
        @thenOpen userlink, ->
          @echo "getting userinfo for '#{ userlink }'..."

          # Deliberately not called `user`: that name is the login name at
          # script level and would be overwritten by an assignment here.
          member = @evaluate getUser, replaceHtml

          if member
            member.signature = loadImages member.signature
            proboard.users.push member
          else
            @echo "could not read userinfo for '#{ userlink }'", "ERROR"

# Step 3: write everything that was collected as one JSON file.
casper.then ->
  json = JSON.stringify(proboard, null, '\t')
  fs.write "data/#{ proboardName }.json", json, 'w'

casper.run()
|
430
src/proboard_saver.js
Normal file
430
src/proboard_saver.js
Normal file
@ -0,0 +1,430 @@
|
|||||||
|
// Generated by CoffeeScript 1.8.0
|
||||||
|
(function() {
|
||||||
|
var casper, findBoards, findPages, findPosts, findThreads, findUserLinks, fs, getUser, loadImages, missingArgumentError, password, proboard, proboardName, proboardNr, proboardUrl, proboardUserUrl, readBoard, replaceHtml, user, utils;
|
||||||
|
|
||||||
|
casper = require('casper').create({
|
||||||
|
verbose: false,
|
||||||
|
logLevel: 'info',
|
||||||
|
pageSettings: {
|
||||||
|
webSecurityEnabled: false
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
utils = require('utils');
|
||||||
|
|
||||||
|
fs = require('fs');
|
||||||
|
|
||||||
|
casper.on('error', function(msg, trace) {
|
||||||
|
return this.echo("Error: " + msg, "ERROR");
|
||||||
|
});
|
||||||
|
|
||||||
|
casper.on('page.error', function(msg, trace) {
|
||||||
|
return this.echo("Error: " + msg, "ERROR");
|
||||||
|
});
|
||||||
|
|
||||||
|
casper.on('remote.message', function(msg, trace) {
|
||||||
|
if (!/Unsafe JavaScript attempt/.test(msg)) {
|
||||||
|
return this.echo("remote log: " + msg, "INFO");
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
loadImages = function(searchString) {
|
||||||
|
var detailImage, image, imageName, imageParts, images, re, _i, _len;
|
||||||
|
images = searchString.match(/\[img\](.*?)\[\/img\]/g);
|
||||||
|
if (images) {
|
||||||
|
for (_i = 0, _len = images.length; _i < _len; _i++) {
|
||||||
|
image = images[_i];
|
||||||
|
detailImage = image.match(/\[img\](.*?)\[\/img\]/)[1];
|
||||||
|
imageParts = detailImage.split('/');
|
||||||
|
imageName = imageParts[imageParts.length - 1];
|
||||||
|
console.log("\t\tdownloading image '" + imageName + "'...");
|
||||||
|
casper.download(detailImage, "data/images/" + imageName);
|
||||||
|
re = new RegExp("\\[img\\]" + detailImage + "\\[/img\\]", "i");
|
||||||
|
searchString = searchString.replace(re, "[img]{{baseurl}}/images/" + imageName + "[/img]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return searchString;
|
||||||
|
};
|
||||||
|
|
||||||
|
replaceHtml = function(element) {
|
||||||
|
var bold, boldElements, colour, colourElements, cursive, cursiveElements, dummyContent, dummyElements, dummySpan, finalText, image, images, link, linkElements, message, quote, quoteAvatar, quoteClear, quoteHeader, quoteHeaderNode, registeredUserNode, underlined, underlinedElements, user, video, videos, _i, _j, _k, _l, _len, _len1, _len2, _len3, _len4, _len5, _len6, _len7, _m, _n, _o, _p;
|
||||||
|
images = Array.prototype.map.call(element.querySelectorAll('img'), function(img) {
|
||||||
|
return {
|
||||||
|
src: img.src,
|
||||||
|
alt: img.hasAttribute('alt') ? img.alt : ''
|
||||||
|
};
|
||||||
|
});
|
||||||
|
for (_i = 0, _len = images.length; _i < _len; _i++) {
|
||||||
|
image = images[_i];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<img[^>]*>/, "[img" + (image.alt ? '=' + image.alt : '') + "]" + image.src + "[/img]");
|
||||||
|
}
|
||||||
|
videos = Array.prototype.map.call(element.querySelectorAll("iframe[title='YouTube video player']"), function(video) {
|
||||||
|
return video.src.split('/')[4].split('?')[0];
|
||||||
|
});
|
||||||
|
for (_j = 0, _len1 = videos.length; _j < _len1; _j++) {
|
||||||
|
video = videos[_j];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<iframe title="YouTube video player"[^>]*>.*?<\/iframe>/, "[video]https://www.youtube.com/watch?v=" + video + "[/video]");
|
||||||
|
}
|
||||||
|
cursiveElements = Array.prototype.map.call(element.querySelectorAll('i'), function(cursive) {
|
||||||
|
return cursive.innerHTML;
|
||||||
|
});
|
||||||
|
for (_k = 0, _len2 = cursiveElements.length; _k < _len2; _k++) {
|
||||||
|
cursive = cursiveElements[_k];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<i>.*?<\/i>/, "[i]" + cursive + "[/i]");
|
||||||
|
}
|
||||||
|
boldElements = Array.prototype.map.call(element.querySelectorAll('b'), function(bold) {
|
||||||
|
return bold.innerHTML;
|
||||||
|
});
|
||||||
|
for (_l = 0, _len3 = boldElements.length; _l < _len3; _l++) {
|
||||||
|
bold = boldElements[_l];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<b>.*?<\/b>/, "[b]" + bold + "[/b]");
|
||||||
|
}
|
||||||
|
underlinedElements = Array.prototype.map.call(element.querySelectorAll('u'), function(underlined) {
|
||||||
|
return underlined.innerHTML;
|
||||||
|
});
|
||||||
|
for (_m = 0, _len4 = underlinedElements.length; _m < _len4; _m++) {
|
||||||
|
underlined = underlinedElements[_m];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<u>.*?<\/u>/, "[u]" + underlined + "[/u]");
|
||||||
|
}
|
||||||
|
colourElements = Array.prototype.map.call(element.querySelectorAll('font[color]'), function(colour) {
|
||||||
|
return {
|
||||||
|
name: colour.attributes['color'].value.toLowerCase(),
|
||||||
|
innerHTML: colour.innerHTML
|
||||||
|
};
|
||||||
|
});
|
||||||
|
for (_n = 0, _len5 = colourElements.length; _n < _len5; _n++) {
|
||||||
|
colour = colourElements[_n];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<font color=".*">[^<\/font>]*<\/font>/, "[colour=" + colour.name + "]" + colour.innerHTML + "[/colour]");
|
||||||
|
}
|
||||||
|
quote = element.querySelector('div.quote_body');
|
||||||
|
while (quote) {
|
||||||
|
quoteHeaderNode = quote.querySelector('div.quote_header');
|
||||||
|
registeredUserNode = quoteHeaderNode ? quote.querySelector('div.quote_header').querySelector('span[itemprop="name"]') : null;
|
||||||
|
user = null;
|
||||||
|
if (registeredUserNode) {
|
||||||
|
user = registeredUserNode.textContent;
|
||||||
|
} else if (quote.parentNode.attributes['author']) {
|
||||||
|
user = quote.parentNode.attributes['author'].value;
|
||||||
|
if (user.substr(0, 1) === '@') {
|
||||||
|
user = user.substr(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
quoteHeader = quote.querySelector('div.quote_header');
|
||||||
|
if (quoteHeader) {
|
||||||
|
quoteHeader.parentNode.removeChild(quoteHeader);
|
||||||
|
}
|
||||||
|
quoteAvatar = quote.querySelector('div.quote_avatar_container');
|
||||||
|
if (quoteAvatar) {
|
||||||
|
quoteAvatar.parentNode.removeChild(quoteAvatar);
|
||||||
|
}
|
||||||
|
quoteClear = quote.querySelector('div.quote_clear');
|
||||||
|
if (quoteClear) {
|
||||||
|
quoteClear.parentNode.removeChild(quoteClear);
|
||||||
|
}
|
||||||
|
message = quote.innerHTML;
|
||||||
|
dummySpan = document.createElement('span');
|
||||||
|
dummySpan.setAttribute('class', 'dummytag');
|
||||||
|
dummySpan.innerHTML = "[quote" + (user ? '=' + user : '') + "]" + message + "[/quote]";
|
||||||
|
quote.parentNode.parentNode.replaceChild(dummySpan, quote.parentNode);
|
||||||
|
quote = element.querySelector('div.quote_body');
|
||||||
|
}
|
||||||
|
dummyElements = Array.prototype.map.call(element.querySelectorAll('span.dummytag'), function(dummy) {
|
||||||
|
return dummy.innerHTML;
|
||||||
|
});
|
||||||
|
for (_o = 0, _len6 = dummyElements.length; _o < _len6; _o++) {
|
||||||
|
dummyContent = dummyElements[_o];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<span class="dummytag">.*<\/span>/, dummyContent);
|
||||||
|
}
|
||||||
|
linkElements = Array.prototype.map.call(element.querySelectorAll('a[href]'), function(link) {
|
||||||
|
return {
|
||||||
|
target: link.attributes['href'].value,
|
||||||
|
name: link.innerText
|
||||||
|
};
|
||||||
|
});
|
||||||
|
for (_p = 0, _len7 = linkElements.length; _p < _len7; _p++) {
|
||||||
|
link = linkElements[_p];
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<a[^>]*>.*?<\/a>/, "[url=" + link.target + "]" + link.name + "[/url]");
|
||||||
|
}
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<font [^>]*>/g, '');
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<\/font>/g, '');
|
||||||
|
element.innerHTML = element.innerHTML.replace(/<div class="quote_clear"><\/div>/g, '');
|
||||||
|
finalText = element.innerText;
|
||||||
|
finalText = finalText.replace(/<br>/g, '\n');
|
||||||
|
finalText = finalText.replace(/\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1');
|
||||||
|
finalText = finalText.replace(/\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1');
|
||||||
|
return finalText;
|
||||||
|
};
|
||||||
|
|
||||||
|
findBoards = function() {
|
||||||
|
var boardDescriptionList, boardDescriptions, boardInfo, boardLinks, boardTitles, boards, i;
|
||||||
|
boards = document.querySelectorAll('tr.board.item td:nth-child(2) > span > a');
|
||||||
|
boardTitles = Array.prototype.map.call(boards, function(e) {
|
||||||
|
return e.textContent;
|
||||||
|
});
|
||||||
|
boardLinks = Array.prototype.map.call(boards, function(e) {
|
||||||
|
return e.href;
|
||||||
|
});
|
||||||
|
boardDescriptions = document.querySelectorAll('tr.board.item td:nth-child(2) > p.description');
|
||||||
|
boardDescriptionList = Array.prototype.map.call(boardDescriptions, function(e) {
|
||||||
|
return e.textContent;
|
||||||
|
});
|
||||||
|
boardInfo = [];
|
||||||
|
i = 0;
|
||||||
|
while (i < boardTitles.length) {
|
||||||
|
boardInfo.push({
|
||||||
|
title: boardTitles[i],
|
||||||
|
description: boardDescriptionList[i],
|
||||||
|
link: boardLinks[i]
|
||||||
|
});
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return boardInfo;
|
||||||
|
};
|
||||||
|
|
||||||
|
findPages = function() {
|
||||||
|
var lastPage, maxPage, pageBase, pageInfo, pageNr, pages, shownPages;
|
||||||
|
shownPages = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]');
|
||||||
|
lastPage = shownPages[shownPages.length - 1];
|
||||||
|
pageInfo = /(.*\?page=)(\d*)/.exec(lastPage);
|
||||||
|
pageBase = pageInfo[1];
|
||||||
|
maxPage = pageInfo[2];
|
||||||
|
return pages = (function() {
|
||||||
|
var _i, _results;
|
||||||
|
_results = [];
|
||||||
|
for (pageNr = _i = 1; 1 <= maxPage ? _i <= maxPage : _i >= maxPage; pageNr = 1 <= maxPage ? ++_i : --_i) {
|
||||||
|
_results.push("" + pageBase + pageNr);
|
||||||
|
}
|
||||||
|
return _results;
|
||||||
|
})();
|
||||||
|
};
|
||||||
|
|
||||||
|
findThreads = function() {
|
||||||
|
var i, threadIds, threadInfo, threadLinks, threadTitles, threads;
|
||||||
|
threads = document.querySelectorAll('tr.item.thread > td:nth-child(3) a.thread-link');
|
||||||
|
threadTitles = Array.prototype.map.call(threads, function(e) {
|
||||||
|
return e.textContent;
|
||||||
|
});
|
||||||
|
threadLinks = Array.prototype.map.call(threads, function(e) {
|
||||||
|
return e.href;
|
||||||
|
});
|
||||||
|
threadIds = Array.prototype.map.call(threads, function(e) {
|
||||||
|
return /.*\/thread\/(\d*)\/.*/.exec(e.href)[1];
|
||||||
|
});
|
||||||
|
threadInfo = [];
|
||||||
|
i = 0;
|
||||||
|
while (i < threadTitles.length) {
|
||||||
|
threadInfo.push({
|
||||||
|
id: threadIds[i],
|
||||||
|
title: threadTitles[i],
|
||||||
|
link: threadLinks[i]
|
||||||
|
});
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
return threadInfo;
|
||||||
|
};
|
||||||
|
|
||||||
|
findPosts = function(replaceHtml) {
|
||||||
|
var postInfo;
|
||||||
|
postInfo = Array.prototype.map.call(document.querySelectorAll('tr.item.post'), function(e) {
|
||||||
|
var attachmentName, attachmentNode, attachmentNodes, attachments, dateNode, id, linkSplit, message, messageNode, timestamp, user, userNode, _i, _len;
|
||||||
|
messageNode = e.querySelector('td.content div.message');
|
||||||
|
attachmentNodes = messageNode.querySelectorAll('div.post_attachments blockquote a');
|
||||||
|
dateNode = e.querySelector('td.content span.date > abbr.time');
|
||||||
|
userNode = e.querySelector('td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile');
|
||||||
|
id = /post-(\d*)/.exec(e.id)[1];
|
||||||
|
message = replaceHtml(messageNode);
|
||||||
|
attachments = [];
|
||||||
|
for (_i = 0, _len = attachmentNodes.length; _i < _len; _i++) {
|
||||||
|
attachmentNode = attachmentNodes[_i];
|
||||||
|
attachmentName = attachmentNode.text;
|
||||||
|
if (attachmentNode.childElementCount > 0) {
|
||||||
|
attachmentName = attachmentNode.children[0].alt;
|
||||||
|
}
|
||||||
|
attachments.push({
|
||||||
|
name: attachmentName,
|
||||||
|
url: attachmentNode.href
|
||||||
|
});
|
||||||
|
}
|
||||||
|
timestamp = parseInt(dateNode.attributes['data-timestamp'].value, 10) / 1000;
|
||||||
|
user = {};
|
||||||
|
if (userNode.href) {
|
||||||
|
linkSplit = userNode.href.split('/');
|
||||||
|
user = {
|
||||||
|
link: linkSplit[linkSplit.length - 1],
|
||||||
|
name: userNode.textContent
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
user = {
|
||||||
|
link: '',
|
||||||
|
name: userNode.firstChild.data.replace('\n\t', '')
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return {
|
||||||
|
id: id,
|
||||||
|
message: message,
|
||||||
|
attachments: attachments,
|
||||||
|
timestamp: timestamp,
|
||||||
|
user: user
|
||||||
|
};
|
||||||
|
});
|
||||||
|
return postInfo;
|
||||||
|
};
|
||||||
|
|
||||||
|
findUserLinks = function() {
|
||||||
|
return Array.prototype.map.call(document.querySelectorAll('div.container.members a.user-link'), function(e) {
|
||||||
|
return e.href;
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reads one member profile from the current `document`.
//
// replaceHtml: function called with the signature node; its result is
//              treated as text (tested against a regex and passed to
//              `replace`).
// Returns { name, signature, status, registered } where `registered` is the
// `data-timestamp` attribute of the first matching abbr.time divided by 1000.
getUser = function(replaceHtml) {
  var profile = {};
  var signatureBox, signatureText, statusMatches, registeredNode;

  profile.name = document.querySelectorAll('span.big_username')[0].textContent;

  signatureBox = document.querySelector('td#center-column > div.content-box:last-child');
  profile.signature = '';
  if (signatureBox) {
    signatureText = replaceHtml(signatureBox);
    // The last content box only counts as a signature when its text carries
    // the "Signature" heading; that heading is stripped from the result.
    profile.signature = /Signature\n/.test(signatureText)
      ? signatureText.replace('Signature\n', '')
      : '';
  }

  statusMatches = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text');
  if (statusMatches.length > 0) {
    profile.status = statusMatches[0].textContent;
  } else {
    profile.status = '';
  }

  registeredNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0];
  profile.registered = parseInt(registeredNode.attributes['data-timestamp'].value, 10) / 1000;

  return profile;
};
|
||||||
|
|
||||||
|
// Reports a missing command line option and asks CasperJS to terminate.
//
// argument: name of the option that was not supplied (without leading dashes).
// Returns whatever casper.exit() returns.
//
// The exit status is 1 so that callers (shell scripts, the grunt `run` task)
// can tell a usage error from a successful run; previously no status was
// passed to casper.exit().
missingArgumentError = function(argument) {
  console.log("missing the " + argument + " argument");
  return casper.exit(1);
};
|
||||||
|
|
||||||
|
// Required command line options. Each one is copied into its variable when
// present (truthy); otherwise the missing option is reported through
// missingArgumentError.
// NOTE(review): CasperJS appears to cast option values to a detected type, so
// a purely numeric user name or password may not arrive as the original
// string - confirm against the CasperJS CLI documentation.

// --board-nr: sent as the `forum` field of the login request.
if (!casper.cli.options['board-nr']) {
  missingArgumentError('board-nr');
} else {
  proboardNr = casper.cli.options['board-nr'];
}

// --board-name: used to build the forum URL and the output file name.
if (!casper.cli.options['board-name']) {
  missingArgumentError('board-name');
} else {
  proboardName = casper.cli.options['board-name'];
}

// --user: sent as the `email` field of the login request.
if (!casper.cli.options['user']) {
  missingArgumentError('user');
} else {
  user = casper.cli.options['user'];
}

// --password: sent as the `password` field of the login request.
if (!casper.cli.options['password']) {
  missingArgumentError('password');
} else {
  password = casper.cli.options['password'];
}
|
||||||
|
|
||||||
|
// Forum base URL and the URL of its member list.
proboardUrl = "http://" + proboardName + ".proboards.com/";
proboardUserUrl = proboardUrl + "members";

// Present a desktop Firefox user agent for every request.
casper.userAgent('Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0');

// Open the forum first, then post the credentials to the login endpoint.
casper.start(proboardUrl, function() {});

casper.thenOpen(
  'https://login.proboards.com/forum_submit/login',
  {
    method: 'post',
    data: {
      forum: proboardNr,
      email: user,
      password: password,
      "continue": 'Continue'
    }
  },
  function() {}
);
|
||||||
|
|
||||||
|
// Queues the CasperJS steps that crawl one board and everything below it.
//
// The board record is filled in place while the queued steps run:
//   board.boards  - sub-boards found on the board page (crawled recursively)
//   board.threads - every thread found on every page of the board
//   thread.posts  - every post found on every page of a thread
//   thread.poll   - file name of the poll screenshot, or null without a poll
//
// board: record whose `link` and `title` properties are read here.
// Returns the result of casper.thenOpen().
//
// Fix: a queued step used to overwrite board.threads with its first element
// after all threads had been queued for crawling, so the exported data kept a
// single thread object instead of the full list. That step is gone.
readBoard = function(board) {
  var pollSelector = 'div.poll.show.ui-poll';

  // Downloads the attachments of one post and points their URLs at the
  // local copies.
  var saveAttachments = function(casper, post) {
    var attachment, i;
    for (i = 0; i < post.attachments.length; i++) {
      attachment = post.attachments[i];
      casper.download(attachment.url, "data/attachments/" + attachment.name);
      attachment.url = "{{baseurl}}/attachments/" + attachment.name;
    }
  };

  // Queues the steps that collect the poll and all posts of one thread.
  var readThread = function(casper, thread) {
    thread.posts = [];

    return casper.thenOpen(thread.link, function() {
      var linkParts, threadPages;

      this.echo("\tgetting posts for thread '" + thread.title + "'...");

      thread.poll = null;
      if (this.exists(pollSelector)) {
        console.log('\t\tsaving poll...');
        // The screenshot is named after the last path segment of the thread link.
        linkParts = thread.link.split('/');
        thread.poll = linkParts[linkParts.length - 1] + ".png";
        this.captureSelector("data/images/polls/" + thread.poll, pollSelector);
      }

      threadPages = this.evaluate(findPages);
      this.each(threadPages, function(casper, threadPage) {
        return this.thenOpen(threadPage, function() {
          var posts = this.evaluate(findPosts, replaceHtml);

          this.each(posts, function(casper, post) {
            // NOTE(review): this reads `message.message`, i.e. it expects the
            // extracted post message to be an object - confirm against
            // findPosts / replaceHtml / loadImages.
            post.message = loadImages(post.message.message);
            utils.dump(post);
            saveAttachments(casper, post);
          });

          utils.dump(posts);
          thread.posts = thread.posts.concat(posts);
        });
      });

      // Runs after all thread pages were read: prepend the poll screenshot
      // to the first post.
      return this.then(function() {
        if (thread.poll && thread.posts[0]) {
          thread.posts[0].message = "[img]{{baseurl}}/images/polls/" + thread.poll + "[/img]\n\n" + thread.posts[0].message;
        }
        if (thread.poll && !thread.posts[0]) {
          console.log("how the fuck did you manage that?");
        }
      });
    });
  };

  return casper.thenOpen(board.link, function() {
    board.boards = this.evaluate(findBoards);
    this.each(board.boards, function(casper, subboard) {
      return readBoard(subboard);
    });

    // Re-open the board page: the sub-board steps queued above navigate away.
    return this.thenOpen(board.link, function() {
      var boardPages;

      this.echo("getting threads for board '" + board.title + "'...");

      board.threads = [];
      boardPages = this.evaluate(findPages);
      this.each(boardPages, function(casper, boardPage) {
        return this.thenOpen(boardPage, function() {
          board.threads = board.threads.concat(this.evaluate(findThreads));
        });
      });

      // Deferred into its own step so that board.threads is complete before
      // the per-thread steps are queued.
      return this.then(function() {
        return this.each(board.threads, readThread);
      });
    });
  });
};
|
||||||
|
|
||||||
|
// Root record of the export; serialized to JSON once the crawl has finished.
proboard = {};

// Collect the top-level boards from the forum index and queue a crawl for
// each of them. Previously only the first board was crawled, which skipped
// every other board and failed when the list was empty.
casper.thenOpen(proboardUrl, function() {
  proboard.boards = this.evaluate(findBoards);
  return this.each(proboard.boards, function(casper, board) {
    return readBoard(board);
  });
});

// Steps queued by readBoard while the crawl runs are executed before this
// one, so the record is complete when it is written.
casper.then(function() {
  var json = JSON.stringify(proboard, null, '\t');
  return fs.write("data/" + proboardName + ".json", json, 'w');
});

casper.run();
|
||||||
|
|
||||||
|
}).call(this);
|
Loading…
Reference in New Issue
Block a user