|
|
@@ -0,0 +1,83 @@
|
|
|
+(in-package :cl-user)
|
|
|
+(defpackage chatikbot.scraping
|
|
|
+ (:use :cl :chatikbot.common)
|
|
|
+ (:export :*cookie-jar*
|
|
|
+ :*current-dom*
|
|
|
+ :*current-status*
|
|
|
+ :*current-headers*
|
|
|
+ :*current-uri*
|
|
|
+ :do-request
|
|
|
+ :do-form
|
|
|
+ :have-cookies
|
|
|
+ :with-request
|
|
|
+ :scrape-list))
|
|
|
+(in-package :chatikbot.scraping)
|
|
|
+
|
|
|
+(defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
|
|
|
+(defvar *current-dom* nil "Currently active request dom")
|
|
|
+(defvar *current-status* nil "Currently active request status")
|
|
|
+(defvar *current-headers* nil "Currently active request headers")
|
|
|
+(defvar *current-uri* nil "Currently active request uri")
|
|
|
+
|
|
|
+(defun apply-context (template context)
|
|
|
+ (sublis context template))
|
|
|
+
|
|
|
+(defun do-request (template &optional context)
|
|
|
+ (let ((args (apply-context template context)))
|
|
|
+ (apply 'xml-request `(,@args :cookie-jar ,*cookie-jar*))))
|
|
|
+
|
|
|
+(defmacro with-request ((template &optional context) &body body)
|
|
|
+ `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
|
|
|
+ (do-request ,template ,context)
|
|
|
+ (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
|
|
|
+ ,@body))
|
|
|
+
|
|
|
+(defun node-full-url (node attr &optional (uri *current-uri*))
|
|
|
+ (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
|
|
|
+
|
|
|
+(defun do-form (form-info context)
|
|
|
+ (destructuring-bind (request selector template) form-info
|
|
|
+ (with-request (request context)
|
|
|
+ (let ((form (clss:select selector *current-dom*)))
|
|
|
+ (unless (zerop (length form))
|
|
|
+ (let* ((form (elt form 0))
|
|
|
+ (action (node-full-url form "action"))
|
|
|
+ (method (or (keyify (plump:attribute form "method")) :get))
|
|
|
+ (content (append (apply-context template context)
|
|
|
+ (loop for input across (clss:select "input,select" form)
|
|
|
+ for name = (plump:attribute input "name")
|
|
|
+ for value = (plump:attribute input "value")
|
|
|
+ when name
|
|
|
+ unless (agets template name)
|
|
|
+ collect (cons name value)))))
|
|
|
+ (do-request `(,action :method ,method :content ,content))))))))
|
|
|
+
|
|
|
+(defun have-cookies (request)
|
|
|
+ (let ((uri (quri:uri (car request))))
|
|
|
+ (cl-cookie:cookie-jar-host-cookies
|
|
|
+ *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
|
|
|
+ :securep (string= (quri:uri-scheme uri) "https"))))
|
|
|
+
|
|
|
+(defun extract-from-node (node template &optional (uri *current-uri*))
|
|
|
+ (destructuring-bind (selector &optional post) template
|
|
|
+ (let* ((nodes (clss:select selector node))
|
|
|
+ (node (and (not (zerop (length nodes)))
|
|
|
+ (elt nodes 0))))
|
|
|
+ (when node
|
|
|
+ (cond
|
|
|
+ ((eq :href post) (node-full-url node "href" uri))
|
|
|
+ ((stringp post) (plump:attribute node post))
|
|
|
+ (t (unspacify (text-with-cdata node))))))))
|
|
|
+
|
|
|
+(defun extract-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
|
|
|
+ (let ((nodes (clss:select items-selector dom)))
|
|
|
+ (when limit
|
|
|
+ (setf nodes (subseq nodes 0 (min limit (length nodes)))))
|
|
|
+ (loop for node across nodes
|
|
|
+ collect (loop for (key . template) in info-templates
|
|
|
+ collect (cons key (extract-from-node node template uri))))))
|
|
|
+
|
|
|
+(defun scrape-list (info &optional context)
|
|
|
+ (destructuring-bind (request item-selector item-info) info
|
|
|
+ (with-request (request context)
|
|
|
+ (extract-info item-selector item-info))))
|