| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- (in-package :cl-user)
- (defpackage chatikbot.scraping
- (:use :cl :chatikbot.common)
- (:export :*cookie-jar*
- :*current-dom*
- :*current-status*
- :*current-headers*
- :*current-uri*
- :do-request
- :do-form
- :have-cookies
- :with-request
- :scrape-list))
- (in-package :chatikbot.scraping)
- (defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
- (defvar *current-dom* nil "Currently active request dom")
- (defvar *current-status* nil "Currently active request status")
- (defvar *current-headers* nil "Currently active request headers")
- (defvar *current-uri* nil "Currently active request uri")
- (defun apply-context (template context)
- (sublis context template))
- (defun do-request (template &optional context)
- (let ((args (apply-context template context)))
- (apply 'xml-request `(,@args :cookie-jar ,*cookie-jar*))))
- (defmacro with-request ((template &optional context) &body body)
- `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
- (do-request ,template ,context)
- (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
- ,@body))
- (defun node-full-url (node attr &optional (uri *current-uri*))
- (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
- (defun do-form (form-info context)
- (destructuring-bind (request selector template) form-info
- (with-request (request context)
- (let ((form (clss:select selector *current-dom*)))
- (unless (zerop (length form))
- (let* ((form (elt form 0))
- (action (node-full-url form "action"))
- (method (or (keyify (plump:attribute form "method")) :get))
- (content (append (apply-context template context)
- (loop for input across (clss:select "input,select" form)
- for name = (plump:attribute input "name")
- for value = (plump:attribute input "value")
- when name
- unless (agets template name)
- collect (cons name value)))))
- (do-request `(,action :method ,method :content ,content))))))))
- (defun have-cookies (request)
- (let ((uri (quri:uri (car request))))
- (cl-cookie:cookie-jar-host-cookies
- *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
- :securep (string= (quri:uri-scheme uri) "https"))))
- (defun extract-from-node (node template &optional (uri *current-uri*))
- (destructuring-bind (selector &optional post) template
- (let* ((nodes (clss:select selector node))
- (node (and (not (zerop (length nodes)))
- (elt nodes 0))))
- (when node
- (cond
- ((eq :href post) (node-full-url node "href" uri))
- ((stringp post) (plump:attribute node post))
- (t (unspacify (text-with-cdata node))))))))
- (defun extract-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
- (let ((nodes (clss:select items-selector dom)))
- (when limit
- (setf nodes (subseq nodes 0 (min limit (length nodes)))))
- (loop for node across nodes
- collect (loop for (key . template) in info-templates
- collect (cons key (extract-from-node node template uri))))))
- (defun scrape-list (info &optional context)
- (destructuring-bind (request item-selector item-info) info
- (with-request (request context)
- (extract-info item-selector item-info))))
|