How to crawl a webpage using JSoup Library: Complete Working Example

The following example shows you a couple of things:

  • How to read a webpage and get the entire HTML and title
  • How to read a webpage and extract all of its links, including the anchor text of each hyperlink.

package org.inepal.products.nlp.crawler;

import org.inepal.products.nlp.domains.Article;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.List;

/**
 * Minimal Jsoup-based crawler. It can fetch a page either as an {@link Article}
 * (full HTML, body text and title) or as the list of hyperlinks found on the page.
 *
 * @author Kushal Paudyal
 * Created on 11/11/2018
 * www.icodejava.com | www.inepal.org | www.sanjaal.com
 */
public class WebCrawler {

    private static final Logger log = LoggerFactory.getLogger(WebCrawler.class);

    /** User-Agent header sent with every request so both crawl methods behave consistently. */
    private static final String USER_AGENT = "Mozilla";

    //TODO: Comment the main method
    /**
     * Demo entry point: crawls a sample page and logs every link found on it.
     *
     * @param args command line arguments (unused)
     */
    public static void main(String[] args) {
        findAllLinks("https://rt.com/");
    }

    /**
     * Crawls a web page and captures its HTML, body text and title.
     *
     * @param url - URL to crawl
     * @return crawled result as Article object, or {@code null} if the page could not be crawled
     */
    public static Article crawlAsArticle(String url) {
        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();

            // NOTE(review): some Jsoup releases may return null from body() for pages
            // without a <body> (e.g. framesets) - guard so we never dereference null.
            Element body = doc.body();

            // Build the article locally and only hand it out once fully populated,
            // so a failure half-way through can never leak a partially filled object.
            Article article = new Article();
            article.setHtml(doc.html());
            article.setText(body == null ? "" : body.text());
            article.setTitle(doc.title());
            return article;

        } catch (Exception e) {
            // Best-effort crawl: log with the full stack trace (exception as last argument)
            // and signal failure to the caller with null.
            log.error("Could not crawl {}", url, e);
            return null;
        }
    }

    /**
     * Crawls a web page and collects every anchor that carries an {@code href} attribute.
     * Each link is stored with its absolute URL and its trimmed anchor text; the anchor
     * text may be empty (for example for links that only wrap an image).
     *
     * @param url - URL to crawl
     * @return the links found, in document order; an empty list if the page could not be
     *         crawled or has no links (never {@code null})
     */
    public static List<TextLink> findAllLinks(String url) {

        List<TextLink> foundTextLinks = new ArrayList<>();

        try {
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).get();
            Elements links = doc.select("a[href]");

            for (Element link : links) {
                TextLink textLink = new TextLink();
                // "abs:href" asks Jsoup to resolve the link against the page's base URI.
                textLink.setLink(link.attr("abs:href"));
                textLink.setText(link.text().trim());

                foundTextLinks.add(textLink);

                log.info("{}", textLink);
            }

            log.info("Found {} Text Links while crawling {}", foundTextLinks.size(), url);

        } catch (Exception e) {
            // Report through the logger (with stack trace) instead of printStackTrace();
            // whatever was collected before the failure is still returned.
            log.error("Could not crawl {}", url, e);
        }

        return foundTextLinks;
    }

}

Here is the Article POJO class

package org.inepal.products.nlp.domains;

/**
 * Plain data holder for a single article, regardless of where it came from
 * (for example a web page or a file). It carries a numeric id, the raw HTML,
 * the extracted text and the title; all properties are mutable.
 *
 * @author Kushal Paudyal
 *
 * www.icodejava.com | www.inepal.org | www.sanjaal.com
 *
 */

public class Article {

    /** Numeric identifier; stays 0 until set. */
    private int id;

    /** Raw HTML of the article. */
    private String html;

    /** Text content of the article. */
    private String text;

    /** Title of the article. */
    private String title;

    /**
     * Creates an empty article: id is 0 and all string properties are {@code null}.
     */
    public Article() {
        // nothing to initialise - fields keep their default values
    }

    /**
     * Creates a fully populated article.
     *
     * @param id    numeric identifier
     * @param html  raw HTML
     * @param text  text content
     * @param title title
     */
    public Article(int id, String html, String text, String title) {
        this.id = id;
        this.html = html;
        this.text = text;
        this.title = title;
    }

    // ---- getters ----

    /** @return the numeric identifier */
    public int getId() {
        return this.id;
    }

    /** @return the raw HTML, or {@code null} if not set */
    public String getHtml() {
        return this.html;
    }

    /** @return the text content, or {@code null} if not set */
    public String getText() {
        return this.text;
    }

    /** @return the title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    // ---- setters ----

    /** @param id the numeric identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @param html the raw HTML to store */
    public void setHtml(String html) {
        this.html = html;
    }

    /** @param text the text content to store */
    public void setText(String text) {
        this.text = text;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

}

Here is the console output from running this program. Note that some websites do not allow crawling: when I tried to crawl the CNN homepage with this code, it did not return any links. However, RT.com did work.

22:15:57.829 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.facebook.com/RTnews/', text='facebook'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://twitter.com/RT_com', text='twitter'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.youtube.com/user/RussiaToday', text='youtube'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://instagram.com/rt', text='instagram'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/applications/', text='Applications'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/rss', text='RSS'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://arabic.rt.com/', text='العربية'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://actualidad.rt.com/', text='ESP'}
22:15:57.834 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://russian.rt.com/', text='РУС'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://deutsch.rt.com/', text='DE'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://francais.rt.com/', text='FR'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://russian.rt.com/inotv', text='ИНОТВ'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://rtd.rt.com/', text='RTД'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://ruptly.tv/', text='RUPTLY'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/', text='RTQuestion more'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/on-air/', text='live'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/', text='News'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/', text='USA'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/uk/', text='UK'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/', text='Sport'}
22:15:57.835 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/', text='Russia'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/business/', text='Business'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/', text='Op-ed'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/360/', text='RT360'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/', text='Shows'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/projects/', text='Projects'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.facebook.com/RTnews/', text='facebook'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://twitter.com/RT_com', text='twitter'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.youtube.com/user/RussiaToday', text='youtube'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://instagram.com/rt', text='instagram'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443699-wwi-trump-putin-macron-femen/', text='Putin chats with Trump, Macron thrashes nationalism at WWI centenary in Paris'}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443699-wwi-trump-putin-macron-femen/', text=''}
22:15:57.836 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443693-gaza-israeli-op-palestinians-killed/', text='At least 6 Palestinians, 1 Israeli killed in IDF ‘operation’ in Gaza'}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443693-gaza-israeli-op-palestinians-killed/', text=''}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/tags/rt-exclusive/', text='RT Exclusive'}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443677-putin-macron-army-europe-trump/', text='‘Good for multipolar world’: Putin positive on Macron’s ‘European army’ plan bashed by Trump (VIDEO)'}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443677-putin-macron-army-europe-trump/', text=''}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443697-far-right-independence-march-poland/', text='WATCH thousands of far-right marchers light a SEA OF FLARES in Poland as police look on (VIDEO)'}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443697-far-right-independence-march-poland/', text=''}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443694-raqqa-bodies-destruction-report/', text='‘Left to count their dead’: Striking report breaks MSM silence on plight of ‘liberated’ Raqqa'}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443694-raqqa-bodies-destruction-report/', text=''}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443672-ww1-audio-final-minute/', text=''}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443672-ww1-audio-final-minute/', text='Moment WWI’s guns fell silent captured on striking 100yo AUDIO'}
22:15:57.837 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/', text='Newsline'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/443702-california-fire-death-toll-rises/', text='Death toll from California wildfires rises to 31, over 220 missing'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443667-ww1-commemoration-paris-leaders/', text='World leaders gather at Arc de Triomphe in Paris for WWI centenary ceremony'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/443651-california-fires-death-toll/', text='California fires death toll climbs to 25, as 14 more bodies recovered'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/443642-dead-southern-california-wildfire/', text='2 found dead as Southern California wildfire doubles in size'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/443629-russian-billionaire-rybolovlev-moscow/', text='Russian billionaire Rybolovlev back in Moscow amid corruption probe in Monaco'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/443619-macron-trump-meet-paris/', text='Macron meets Trump in Paris, says Europe should ‘take on more’ of defense burden'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/newsline/443568-macedonia-emergency-borders-migrants/', text='Macedonia extends state of emergency at borders to reduce migrants’ flows'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/where-to-watch/', text='Where to watch'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/schedule/', text='Schedule'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.newsamizdat.com/', text=''}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/', text='News'}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443701-winter-is-coming-snow-tsunami/', text=''}
22:15:57.838 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443701-winter-is-coming-snow-tsunami/', text='Winter is Coming: WATCH snow tsunami devour Siberian city'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443695-khashoggi-last-words-recording/', text=''Take this bag off my head' were Jamal Khashoggi's last words, Turkish investigative journalist says'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443692-paris-protest-trump-war/', text='‘Trump means war’: Hundreds protest US president’s WWI centenary visit to Paris (VIDEO)'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443691-google-berlin-stasi-hq-germany/', text='Make police states hip again? Google offered STASI HQ for new Berlin office'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/business/', text='Business'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443687-china-school-bitcoin-mining/', text=''}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/443687-china-school-bitcoin-mining/', text='Principal fired for running secret cryptocurrency mining scheme at school'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/business/443671-chinas-oil-imports-record-surge/', text='China’s oil imports surge to record high'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/business/443654-russia-first-cruise-liner/', text='Floating 5-star hotel: Russia’s 1st cruise liner to be ready for tests in 2019'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/business/443610-japan-fukushima-nuclear-comeback/', text='In wake of Fukushima disaster nuclear energy stages comeback in Japan'}
22:15:57.839 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-edge/', text='Op-ed'}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/443540-think-tanks-media-disinformation/', text=''}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/443540-think-tanks-media-disinformation/', text='Dirty little secret: ‘Think tanks’ are among top culprits in media disinformation crisis'}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/443509-pompeo-iran-normal-us/', text='US directs Iran to act like a ‘normal’ country. What is a normal country?'}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/443469-trump-election-blue-midterm/', text='Trumped by the President: How the Democrats’ Blue Wave turned into a Blue Rinse. By George Galloway'}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/443437-china-us-taiwan-war/', text='Will the ‘Taiwan question’ give rise to a World War III scenario?'}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.youtube.com/watch?v=aweNFPh812o', text=''}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/', text='USA'}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/443703-parody-account-north-korea-mcfaul/', text=''}
22:15:57.840 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/443703-parody-account-north-korea-mcfaul/', text='US diplomats fall for parody North Korean account in their rush to pick on Trump'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/443698-cnn-could-be-about-to/', text='CNN could be about to sue Trump over Acosta ban'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/443700-fort-lauderdale-bomb-threat/', text='Bomb squad responds to suspicious package at Fort Lauderdale airport, Florida'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/443669-firefighter-migrant-hunt-comments/', text='Firefighter sacked over inflammatory ‘joke’ about hunting migrants on US border'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/', text='Sport'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/443688-mass-brawl-mma-fight/', text=''}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/443688-mass-brawl-mma-fight/', text='Riot police break up mass brawl at Moscow MMA event after fighter hits opponent after bell (VIDEO)'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/443680-ufc-denver-yair-rodriguez/', text='UFC’s 25th anniversary event ends with stunning last-second KO (VIDEO)'}
22:15:57.841 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/443676-usyk-bellew-world-title/', text=''Tony Bellew died tonight': Brilliant Oleksandr Usyk defends crown, stops Bellew in the 8th'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/443649-tuktamysheva-trump-kick-ass/', text='Russian figure-skating 'Empress' says she wants to 'kick Trump's ass' on Twitter & on ice'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/', text='Russia'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443696-khakassia-election-russia-communist/', text=''}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443696-khakassia-election-russia-communist/', text='Communist party candidate elected head of Russia's Khakassia'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443567-police-complaints-russia-kolokoltsev/', text='300 police officers fired in Russia this year after complaints from citizens'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443539-food-product-tracking-system/', text='Russia to make food industry more transparent with new field-to-counter product-tracking system'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/443562-meduza-editor-sexual-harassment/', text='Editor-in-chief of anti-Kremlin online news site resigns amid sexual harassment scandal'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/', text='Shows'}
22:15:57.842 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/keiser-report/443605-uk-austerity-economic-disaster/', text=''}
22:15:57.843 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/keiser-report/', text='Keiser Report'}
22:15:57.843 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/keiser-report/443605-uk-austerity-economic-disaster/', text='Share buybacks + austerity = economic disaster (E1304)'}
22:15:57.843 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/worlds-apart-oksana-boyko/443659-peggy-hicks-un-human-rights/', text=''}
22:15:57.843 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/worlds-apart-oksana-boyko/', text='Worlds Apart'}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/worlds-apart-oksana-boyko/443659-peggy-hicks-un-human-rights/', text='Rights gone wrong? Peggy Hicks, director at the UN Human Rights Office'}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/on-contact/443658-neoliberalism-opinions-inequality-history/', text=''}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/on-contact/', text='On contact'}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/on-contact/443658-neoliberalism-opinions-inequality-history/', text='A critic of neoliberalism'}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/crosstalk/443524-midterm-elections-democrats-republicans/', text=''}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/crosstalk/', text='CrossTalk'}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/crosstalk/443524-midterm-elections-democrats-republicans/', text='Gridlock warfare'}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/documentary/443566-cadets-fleet-academy-sea/', text=''}
22:15:57.844 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/documentary/', text='Documentary'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/documentary/443566-cadets-fleet-academy-sea/', text='On deck, offline'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://rtd.rt.com/series/combat-approved-series/nebo-m-radar-complex-the-stealth-buster/', text='Nebo-M Radar Complex: The Stealth Buster. Tracks missiles, stealth planes & drones'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://arabic.rt.com/', text='العربية'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://actualidad.rt.com/', text='esp'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://russian.rt.com/', text='рус'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://deutsch.rt.com/', text='de'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://francais.rt.com/', text='fr'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://russian.rt.com/inotv', text='инотв'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://rtd.rt.com/', text='rтд'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://ruptly.tv/', text='ruptly'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.facebook.com/RTnews/', text='facebook'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://twitter.com/RT_com', text='twitter'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.youtube.com/user/RussiaToday', text='youtube'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://instagram.com/rt', text='instagram'}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://play.google.com/store/apps/details?id=com.rt.mobile.english', text=''}
22:15:57.845 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://itunes.apple.com/app/rt-news-russia-today/id649316948?mt=8', text=''}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.microsoft.com/ru-ru/store/p/rt-news-english/9wzdncrdn5ns?rtc=1', text=''}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/news/', text='News'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/usa/', text='USA'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/uk/', text='UK'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/russia/', text='Russia'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/business/', text='Business'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sport/', text='Sport'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/op-ed/', text='Op-ed'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/360/', text='RT360'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/shows/', text='Shows'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/projects/', text='Projects'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.facebook.com/RTvids/', text='RT Play'}
22:15:57.846 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/on-air/', text='Live'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/applications/', text='Applications'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/where-to-watch/', text='Where to watch'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/schedule/', text='Schedule'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/sponsored-content/', text='Sponsored content'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/onair-talent/', text='On-Air Talent'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/legal-disclaimer/', text='Legal disclaimer'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/privacy-policy/', text='Privacy policy'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/about-us/', text='About us'}
22:15:57.847 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/about-us/contact-info/', text='Contact info'}
22:15:57.849 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='mailto:feedback@rttv.ru', text='Feedback'}
22:15:57.849 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/about-us/vacancies/', text='Vacancies'}
22:15:57.849 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/', text=''}
22:15:57.849 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - TextLink{link='https://www.rt.com/privacy-policy/', text='Privacy policy'}
22:15:57.849 [main] INFO org.inepal.products.nlp.crawler.WebCrawler - Found 148 Text Links while crawling https://www.rt.com/
[My Video Promotion]
Tagged , , , , , . Bookmark the permalink.

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.