initial commit
This commit is contained in:
commit
fc83ea25a9
6 changed files with 861 additions and 0 deletions
377
src/proboard_saver.coffee
Normal file
377
src/proboard_saver.coffee
Normal file
|
@ -0,0 +1,377 @@
|
|||
# 2015 by Sebastian Hugentobler <shugentobler@vanwa.ch>
|
||||
# To the extent possible under law, the author(s) have dedicated all copyright
|
||||
# and related and neighboring rights to this software to the public domain
|
||||
# worldwide. This software is distributed without any warranty.
|
||||
# See http://creativecommons.org/publicdomain/zero/1.0/ for a description of CC0.
|
||||
|
||||
# The CasperJS instance that drives the whole scrape.
# Web security is switched off in the page settings.
casper = require('casper').create
  verbose: false
  logLevel: 'info'
  pageSettings:
    webSecurityEnabled: false
|
||||
|
||||
utils = require('utils')
|
||||
fs = require('fs')
|
||||
|
||||
# Script errors and errors raised inside the page are reported the same way.
reportError = (msg, trace) ->
  @echo "Error: #{msg}", "ERROR"

casper.on 'error', reportError
casper.on 'page.error', reportError

# Relay console output from the page, except the "Unsafe JavaScript attempt"
# messages.
casper.on 'remote.message', (msg, trace) ->
  unless /Unsafe JavaScript attempt/.test msg
    @echo "remote log: #{msg}", "INFO"
|
||||
|
||||
# Downloads every image referenced by an [img]...[/img] tag in `searchString`
# into data/images/ and rewrites each tag to point at the local copy
# ("[img]{{baseurl}}/images/<file name>[/img]").
#
# searchString - String containing BBCode-style markup.
#
# Returns the markup with all plain [img] tags rewritten.
loadImages = (searchString) ->
  imageTags = searchString.match /\[img\](.*?)\[\/img\]/g
  return searchString unless imageTags

  for imageTag in imageTags
    imageUrl = imageTag.match(/\[img\](.*?)\[\/img\]/)[1]

    # The file name is whatever follows the last slash of the URL.
    urlParts = imageUrl.split '/'
    imageName = urlParts[urlParts.length - 1]

    console.log "\t\tdownloading image '#{imageName}'..."

    casper.download imageUrl, "data/images/#{imageName}"

    # Replace the matched tag literally. Building a RegExp from the URL breaks
    # as soon as it contains regex metacharacters such as '?' or '+'. The
    # replacement is a function so '$' sequences in the file name stay literal.
    localTag = "[img]{{baseurl}}/images/#{imageName}[/img]"
    searchString = searchString.replace imageTag, -> localTag

  return searchString
|
||||
|
||||
# Converts the HTML inside `element` to BBCode-style markup and returns the
# result as plain text. The element's innerHTML is rewritten along the way.
#
# Converted constructs: <img>, YouTube <iframe>s, <i>, <b>, <u>,
# <font color>, quote blocks (div.quote_body) and <a href> links. Remaining
# <font> tags are dropped, smiley images are reduced to their alt text and
# everything from an "Attachments:" footer onwards is cut off.
#
# Most conversions share one pattern: first collect the data of all matching
# nodes, then replace the first remaining tag in the markup once per node.
# This relies on the nodes being listed in the order their tags occur.
#
# element - DOM element whose content is converted (it is modified).
#
# Returns the converted text as a String.
replaceHtml = (element) ->
  images = Array::map.call element.querySelectorAll('img'), (img) ->
    src: img.src, alt: if img.hasAttribute('alt') then img.alt else ''

  for image in images
    element.innerHTML = element.innerHTML.replace /<img[^>]*>/, "[img#{if image.alt then '=' + image.alt else ''}]#{image.src}[/img]"

  # The video id is taken from the fifth '/'-separated part of the iframe
  # source, with any query string removed.
  videos = Array::map.call element.querySelectorAll("iframe[title='YouTube video player']"), (video) -> video.src.split('/')[4].split('?')[0]
  for video in videos
    element.innerHTML = element.innerHTML.replace /<iframe title="YouTube video player"[^>]*>.*?<\/iframe>/, "[video]https://www.youtube.com/watch?v=#{video}[/video]"

  cursiveElements = Array::map.call element.querySelectorAll('i'), (cursive) -> cursive.innerHTML
  for cursive in cursiveElements
    element.innerHTML = element.innerHTML.replace /<i>.*?<\/i>/, "[i]#{cursive}[/i]"

  boldElements = Array::map.call element.querySelectorAll('b'), (bold) -> bold.innerHTML
  for bold in boldElements
    element.innerHTML = element.innerHTML.replace /<b>.*?<\/b>/, "[b]#{bold}[/b]"

  underlinedElements = Array::map.call element.querySelectorAll('u'), (underlined) -> underlined.innerHTML
  for underlined in underlinedElements
    element.innerHTML = element.innerHTML.replace /<u>.*?<\/u>/, "[u]#{underlined}[/u]"

  colourElements = Array::map.call element.querySelectorAll('font[color]'), (colour) -> name: colour.attributes['color'].value.toLowerCase(), innerHTML: colour.innerHTML
  for colour in colourElements
    # Match the opening tag up to its '>' and then lazily up to the first
    # closing tag. The previous pattern used a character class that rejected
    # any content containing the letters f, o, n, t (or '/'), so most coloured
    # text was never converted.
    element.innerHTML = element.innerHTML.replace /<font color="[^>]*>[\s\S]*?<\/font>/, "[colour=#{colour.name}]#{colour.innerHTML}[/colour]"

  # Quotes are converted one at a time: each quote wrapper is swapped for a
  # marker span holding the [quote] markup, until no quote body is left.
  quote = element.querySelector 'div.quote_body'
  while quote
    quoteHeader = quote.querySelector 'div.quote_header'

    registeredUserNode = if quoteHeader then quoteHeader.querySelector('span[itemprop="name"]') else null

    user = null

    if registeredUserNode
      user = registeredUserNode.textContent
    else if quote.parentNode.attributes['author']
      user = quote.parentNode.attributes['author'].value
      # NOTE(review): assumed to apply only to the author attribute, since the
      # original indentation was lost - confirm.
      if user.substr(0, 1) == '@'
        user = user.substr 1

    # Strip the decoration so that only the quoted message remains.
    if quoteHeader then quoteHeader.parentNode.removeChild quoteHeader

    quoteAvatar = quote.querySelector 'div.quote_avatar_container'
    if quoteAvatar then quoteAvatar.parentNode.removeChild quoteAvatar

    quoteClear = quote.querySelector 'div.quote_clear'
    if quoteClear then quoteClear.parentNode.removeChild quoteClear

    message = quote.innerHTML

    dummySpan = document.createElement 'span'
    dummySpan.setAttribute 'class', 'dummytag'
    dummySpan.innerHTML = "[quote#{if user then '=' + user else ''}]#{message}[/quote]"

    quote.parentNode.parentNode.replaceChild dummySpan, quote.parentNode

    quote = element.querySelector 'div.quote_body'

  # Unwrap the marker spans again, leaving only their [quote] markup.
  dummyElements = Array::map.call element.querySelectorAll('span.dummytag'), (dummy) -> dummy.innerHTML
  for dummyContent in dummyElements
    element.innerHTML = element.innerHTML.replace /<span class="dummytag">.*<\/span>/, dummyContent

  linkElements = Array::map.call element.querySelectorAll('a[href]'), (link) -> target: link.attributes['href'].value, name: link.innerText
  for link in linkElements
    element.innerHTML = element.innerHTML.replace /<a[^>]*>.*?<\/a>/, "[url=#{link.target}]#{link.name}[/url]"

  # Drop whatever <font> tags and quote spacers are still around.
  element.innerHTML = element.innerHTML.replace /<font [^>]*>/g, ''
  element.innerHTML = element.innerHTML.replace /<\/font>/g, ''
  element.innerHTML = element.innerHTML.replace /<div class="quote_clear"><\/div>/g, ''

  finalText = element.innerText
  finalText = finalText.replace /<br>/g, '\n'

  # Smileys hosted on images.proboards.com are reduced to their alt text.
  finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/images\/smiley\/.*?\[\/img\]/g, '$1'
  finalText = finalText.replace /\[img=([^\]]*)\]http:\/\/images\.proboards\.com\/v5\/smiley\/.*?\[\/img\]/g, '$1'

  # Cut the text off where the attachments footer starts.
  attachmentIndex = finalText.indexOf('\n\n[b]Attachments:[/b]\n\n')
  if attachmentIndex > -1
    finalText = finalText.substring 0, attachmentIndex

  return finalText
|
||||
|
||||
# Collects the boards listed on the current page. Handed to casper's evaluate,
# so it only uses the page's `document`.
#
# Titles/links and descriptions come from two separate queries and are paired
# by position.
#
# Returns an Array of { title, description, link } objects.
findBoards = ->
  anchors = document.querySelectorAll('tr.board.item td:nth-child(2) > span > a')
  descriptionNodes = document.querySelectorAll('tr.board.item td:nth-child(2) > p.description')

  Array::map.call anchors, (anchor, index) ->
    descriptionNode = descriptionNodes[index]

    title: anchor.textContent
    description: if descriptionNode then descriptionNode.textContent else undefined
    link: anchor.href
|
||||
|
||||
# Builds the list of page URLs for the paginated view that is currently open.
# Handed to casper's evaluate, so it only uses the page's `document`.
#
# The highest page number is read from the last pagination link; the URLs for
# page 1 up to that number are generated from the link's "...?page=" prefix.
# If the page has no usable pagination link, the current page is the only
# page and its own URL is returned.
#
# Returns an Array of URL Strings (never empty).
findPages = ->
  pageLinks = document.querySelectorAll('ul.ui-pagination > li.ui-pagination-page.ui-pagination-slot > a[href]')
  lastLink = pageLinks[pageLinks.length - 1]

  # \d+ instead of \d*: an empty page number must not count as a match.
  pageInfo = if lastLink then /(.*\?page=)(\d+)/.exec(lastLink.href) else null
  return [document.location.href] unless pageInfo

  pageBase = pageInfo[1]
  maxPage = parseInt pageInfo[2], 10

  # "by 1" forces an ascending loop, so a bogus page number below 1 yields no
  # pages instead of counting down.
  pages = ("#{pageBase}#{pageNr}" for pageNr in [1..maxPage] by 1)
  return [document.location.href] if pages.length is 0

  return pages
|
||||
|
||||
# Collects the threads listed on the current board page. Handed to casper's
# evaluate, so it only uses the page's `document`.
#
# The id is the number following "/thread/" in the thread link.
#
# Returns an Array of { id, title, link } objects.
findThreads = ->
  threadAnchors = document.querySelectorAll('tr.item.thread > td:nth-child(3) a.thread-link')

  Array::map.call threadAnchors, (anchor) ->
    id: /.*\/thread\/(\d*)\/.*/.exec(anchor.href)[1]
    title: anchor.textContent
    link: anchor.href
|
||||
|
||||
# Collects all posts on the current thread page. Handed to casper's evaluate
# together with the HTML converter.
#
# replaceHtml - Function turning a message element into text.
#
# Returns an Array of { id, message, attachments, timestamp, user } objects.
findPosts = (replaceHtml) ->
  Array::map.call document.querySelectorAll('tr.item.post'), (row) ->
    messageNode = row.querySelector('td.content div.message')
    # Grab the attachment links before the message is converted.
    attachmentLinks = messageNode.querySelectorAll('div.post_attachments blockquote a')
    timeNode = row.querySelector('td.content span.date > abbr.time')
    authorNode = row.querySelector('td.left-panel a.user-link,td.left-panel > div.mini-profile.guest-mini-profile')

    postId = /post-(\d*)/.exec(row.id)[1]
    text = replaceHtml(messageNode)

    # A link with a child element is named after that child's alt text,
    # otherwise after its own text.
    files = Array::map.call attachmentLinks, (attachmentLink) ->
      name: if attachmentLink.childElementCount > 0 then attachmentLink.children[0].alt else attachmentLink.text
      url: attachmentLink.href

    # data-timestamp is divided by 1000.
    seconds = parseInt(timeNode.attributes['data-timestamp'].value, 10) / 1000

    # Nodes with an href are user links; anything else is treated as a guest
    # profile whose first child holds the name.
    author =
      if authorNode.href
        segments = authorNode.href.split '/'
        { link: segments[segments.length - 1], name: authorNode.textContent }
      else
        { link: '', name: authorNode.firstChild.data.replace '\n\t', '' }

    return {
      id: postId
      message: text
      attachments: files
      timestamp: seconds
      user: author
    }
|
||||
|
||||
# Collects the profile URLs of all members listed on the current page.
# Handed to casper's evaluate, so it only uses the page's `document`.
#
# Returns an Array of URL Strings.
findUserLinks = ->
  memberAnchors = document.querySelectorAll('div.container.members a.user-link')
  (memberAnchor.href for memberAnchor in memberAnchors)
|
||||
|
||||
# Reads the profile shown on the current page. Handed to casper's evaluate
# together with the HTML converter.
#
# replaceHtml - Function turning an element into text.
#
# Returns an object with name, signature, status and registered.
getUser = (replaceHtml) ->
  profile = name: document.querySelectorAll('span.big_username')[0].textContent

  # The last content box only counts as a signature if its converted text
  # contains the "Signature" heading, which is then removed from the text.
  signatureNode = document.querySelector('td#center-column > div.content-box:last-child')
  signatureText = if signatureNode then replaceHtml(signatureNode) else ''
  profile.signature =
    if /Signature\n/.test signatureText
      signatureText.replace 'Signature\n', ''
    else
      ''

  statusNodes = document.querySelectorAll('form.form_user_status div.content-box tr span.personal-text')
  profile.status = if statusNodes.length > 0 then statusNodes[0].textContent else ''

  # data-timestamp of the first time element, divided by 1000.
  registeredNode = document.querySelectorAll('td#center-column > div.content-box abbr.time')[0]
  profile.registered = parseInt(registeredNode.attributes['data-timestamp'].value, 10) / 1000

  return profile
|
||||
|
||||
# Reports a missing command line option and ends the run.
#
# argument - name of the missing option, used in the message.
missingArgumentError = (argument) ->
  console.log "missing the #{ argument } argument"
  # A usage error must not look like a successful run to the calling shell,
  # so exit with a non-zero status.
  casper.exit 1
|
||||
|
||||
# All four command line options are required; each one that is missing (or
# falsy) is reported through missingArgumentError.
cliOptions = casper.cli.options

proboardNr = cliOptions['board-nr'] if cliOptions['board-nr']
missingArgumentError 'board-nr' unless proboardNr

proboardName = cliOptions['board-name'] if cliOptions['board-name']
missingArgumentError 'board-name' unless proboardName

user = cliOptions['user'] if cliOptions['user']
missingArgumentError 'user' unless user

password = cliOptions['password'] if cliOptions['password']
missingArgumentError 'password' unless password

# Forum start page and member list.
proboardUrl = "http://#{ proboardName }.proboards.com/"
proboardUserUrl = "#{ proboardUrl }members"

casper.userAgent 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:30.0) Gecko/20100101 Firefox/30.0'
|
||||
|
||||
|
||||
# Open the forum first; nothing needs to happen on that page.
casper.start proboardUrl, (->)

# Log in by posting the credentials to the login endpoint.
casper.thenOpen('https://login.proboards.com/forum_submit/login', {
  method: 'post'
  data: {
    forum: proboardNr
    email: user
    password: password
    continue: 'Continue'
  }
}, (->))
|
||||
|
||||
# Schedules the casper steps that read one board: its sub-boards (handled
# recursively), its threads and every post of every thread. The results are
# stored on the objects themselves (board.boards, board.threads,
# thread.posts, thread.poll).
#
# board - object with at least `link` and `title`.
readBoard = (board) ->
  casper.thenOpen board.link, ->
    board.boards = @evaluate findBoards

    @each board.boards, (casper, subboard) -> readBoard subboard

    # Open the board again for its thread list.
    @thenOpen board.link, ->
      @echo "getting threads for board '#{ board.title }'..."

      board.threads = []

      boardPageUrls = @evaluate findPages
      @each boardPageUrls, (casper, boardPageUrl) ->
        @thenOpen boardPageUrl, ->
          board.threads = board.threads.concat @evaluate findThreads

      # Scheduled after the page steps, so the thread list is filled by the
      # time this step runs.
      @then ->
        @each board.threads, (casper, thread) ->
          thread.posts = []

          @thenOpen thread.link, ->
            @echo "\tgetting posts for thread '#{ thread.title }'..."

            # A poll is stored as a screenshot named after the last part of
            # the thread link.
            pollFile = null
            if @exists 'div.poll.show.ui-poll'
              console.log '\t\tsaving poll...'

              urlSegments = thread.link.split '/'
              pollFile = "#{urlSegments[urlSegments.length - 1]}.png"

              @captureSelector "data/images/polls/#{pollFile}", 'div.poll.show.ui-poll'

            thread.poll = pollFile

            threadPageUrls = @evaluate findPages
            @each threadPageUrls, (casper, threadPageUrl) ->
              @thenOpen threadPageUrl, ->
                pagePosts = @evaluate findPosts, replaceHtml

                # Download images and attachments and point the post at the
                # local copies.
                @each pagePosts, (casper, post) ->
                  post.message = loadImages post.message

                  for file in post.attachments
                    casper.download file.url, "data/attachments/#{file.name}"
                    file.url = "{{baseurl}}/attachments/#{file.name}"

                thread.posts = thread.posts.concat pagePosts

            # Once all pages are read, put the poll image in front of the
            # first post.
            @then ->
              if thread.poll
                if thread.posts[0]
                  thread.posts[0].message = "[img]{{baseurl}}/images/polls/#{thread.poll}[/img]\n\n#{thread.posts[0].message}"
                else
                  console.log "how the fuck did you manage that?"
|
||||
|
||||
# Everything that gets scraped ends up in this object and is written out as
# JSON at the end of the run.
proboard = {}

# Boards, threads and posts, starting from the forum's start page.
casper.thenOpen proboardUrl, ->
  proboard.boards = @evaluate findBoards

  @each proboard.boards, (casper, board) ->
    readBoard board

# Member profiles, starting from the member list.
casper.thenOpen proboardUserUrl, ->
  proboard.users = []

  userPages = @evaluate findPages

  @each userPages, (casper, userPage) ->
    @thenOpen userPage, ->
      userlinks = @evaluate findUserLinks

      @each userlinks, (casper, userlink) ->
        @thenOpen userlink, ->
          @echo "getting userinfo for '#{ userlink }'..."

          # Deliberately not called `user`: CoffeeScript has no shadowing, so
          # assigning to `user` here would overwrite the module-level variable
          # holding the login name.
          profile = @evaluate getUser, replaceHtml

          # NOTE(review): evaluate presumably yields nothing when the page
          # function fails - skip the profile instead of crashing on it.
          unless profile
            @echo "could not read userinfo for '#{ userlink }', skipping...", "WARNING"
            return

          profile.signature = loadImages profile.signature

          proboard.users.push profile

# Runs after all scraping steps: dump the collected data.
casper.then ->
  json = JSON.stringify(proboard, null, '\t')
  fs.write "data/#{ proboardName }.json", json, 'w'

casper.run()
|
Loading…
Add table
Add a link
Reference in a new issue