CrawlDataProvider.cs 5.3 KB
Newer Older
Ken's avatar
Ken committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
using System.Net.Http;
using System;
using System.Text.RegularExpressions;
using System.Net;
using News_site.Models;
using System.Threading.Tasks;
using Microsoft.EntityFrameworkCore;
using System.Linq;
using System.Collections.Generic;
using News_site.Services;
using System.Globalization;

namespace News_site.Provider
{
	/// <summary>
	/// Scrapes the category menu and the latest-news listing of the site at
	/// <c>baseUrl</c> with regular expressions and stores new rows through <c>db</c>.
	/// </summary>
	/// <remarks>
	/// NOTE(review): <c>db</c> is not declared in this class; it is presumably the
	/// EF Core context exposed by <see cref="BaseProvider"/> - confirm.
	/// </remarks>
	public class CrawlDataProvider : BaseProvider
	{
		private const string LatestNewsPath = "tin-moi-nhat.htm";
		private const string PublishDateFormat = "dd/MM/yyyy HH:mm";

		// One shared client for every provider instance: creating an HttpClient per
		// instance can exhaust sockets when providers are created frequently.
		private static readonly HttpClient httpClient = new HttpClient();

		private readonly string baseUrl = "https://tuoitre.vn/";
		private readonly HandleExceptionAsync _handleExceptionAsync = new HandleExceptionAsync();

		/// <summary>
		/// Downloads a page and returns its HTML-decoded content (blocking).
		/// </summary>
		/// <param name="url">Path appended to the base address.</param>
		/// <param name="absoluteUrl">
		/// Base address to use instead of <c>baseUrl</c>; null or empty selects <c>baseUrl</c>.
		/// </param>
		/// <returns>The decoded HTML, or an empty string when the download fails.</returns>
		public string CrawlDataFromUrl(string url = "", string absoluteUrl = "")
		{
			// Kept synchronous for existing callers; the async methods of this class
			// await CrawlDataFromUrlAsync directly instead of blocking a thread.
			return CrawlDataFromUrlAsync(url, absoluteUrl).GetAwaiter().GetResult();
		}

		/// <summary>
		/// Asynchronous core of <see cref="CrawlDataFromUrl"/>. Failures are logged
		/// through <c>_handleExceptionAsync</c> and reported as an empty string.
		/// </summary>
		private async Task<string> CrawlDataFromUrlAsync(string url = "", string absoluteUrl = "")
		{
			try
			{
				string targetUrl = (string.IsNullOrEmpty(absoluteUrl) ? baseUrl : absoluteUrl) + url;

				// ConfigureAwait(false) keeps the blocking wrapper above safe from
				// deadlocks when it is called on a thread with a synchronization context.
				string rawHtml = await httpClient.GetStringAsync(targetUrl).ConfigureAwait(false);

				return WebUtility.HtmlDecode(rawHtml);
			}
			catch (Exception error)
			{
				// Best effort: a page that cannot be fetched is logged and skipped by callers.
				_handleExceptionAsync.WriteError(error);
				return "";
			}
		}

		/// <summary>
		/// Crawls the latest-news listing and saves every article whose title is not
		/// stored yet and whose category link matches a stored category.
		/// </summary>
		public async Task CrawlLatestNews()
		{
			string htmlData = await CrawlDataFromUrlAsync(LatestNewsPath);
			MatchCollection newsListHtml = Regex.Matches(htmlData, @"<div class=""box-category-item(.*?)</p>", RegexOptions.Singleline);

			List<Newspaper> newsList = await CollectNewNewspapersAsync(newsListHtml);

			db.Newspapers.AddRange(newsList);
			await db.SaveChangesAsync();
		}

		/// <summary>
		/// Crawls the navigation menu of the home page and saves every category whose
		/// link is not stored yet.
		/// </summary>
		public async Task CrawlCategories()
		{
			var categoryList = new List<Category>();
			// Links already handled in this run, so a link repeated in the menu is added once.
			var handledLinks = new HashSet<string>(StringComparer.Ordinal);

			string htmlData = await CrawlDataFromUrlAsync();
			string menuHtml = Regex.Match(htmlData, @"<ul class=""menu-nav(.*?)</ul>", RegexOptions.Singleline).Value;

			foreach (Match menuItem in Regex.Matches(menuHtml, @"<li(.*?)</li>", RegexOptions.Singleline))
			{
				string categoryTitle = ExtractAttribute(menuItem.Value, "title");
				string categoryLink = ExtractAttribute(menuItem.Value, "href");

				// A menu entry without a link cannot be matched against articles later.
				if (categoryLink.Length == 0 || !handledLinks.Add(categoryLink))
				{
					continue;
				}

				bool isCategoryExisted = await db.Categories.AnyAsync(item => item.Link.Equals(categoryLink));

				if (!isCategoryExisted)
				{
					categoryList.Add(new Category
					{
						Label = categoryTitle,
						Link = categoryLink
					});
				}
			}

			db.Categories.AddRange(categoryList);
			await db.SaveChangesAsync();
		}

		/// <summary>
		/// Turns the listing items into <see cref="Newspaper"/> entities, downloading
		/// each article page that is not known yet.
		/// </summary>
		/// <param name="listingItems">Regex matches, one per listing entry.</param>
		/// <returns>The entities to insert; empty when nothing new was found.</returns>
		private async Task<List<Newspaper>> CollectNewNewspapersAsync(MatchCollection listingItems)
		{
			var newsList = new List<Newspaper>();

			if (listingItems.Count == 0)
			{
				return newsList;
			}

			// Loaded once for the whole run instead of once per article. Nothing is
			// saved inside the loop, so the stored data cannot change in between.
			List<string> storedTitles = await db.Newspapers.AsNoTracking().Select(news => news.Title).ToListAsync();
			List<string> storedCategoryLinks = await db.Categories.AsNoTracking().Select(category => category.Link).ToListAsync();
			var knownTitles = new HashSet<string>(storedTitles, StringComparer.Ordinal);
			var knownCategoryLinks = new HashSet<string>(storedCategoryLinks, StringComparer.Ordinal);

			foreach (Match listingItem in listingItems)
			{
				string itemHtml = listingItem.Value;

				string newsUrl = ExtractAttribute(itemHtml, "href");
				string newsTitleElem = Regex.Match(itemHtml, @"<h3(.*?)</h3>", RegexOptions.Singleline).Value;
				string newsTitle = ExtractAttribute(newsTitleElem, "title");

				// Checked before any download: an article that is already stored (or was
				// already queued in this run) does not need its page fetched again.
				if (knownTitles.Contains(newsTitle))
				{
					continue;
				}

				string newsImgUrl = ExtractAttribute(itemHtml, "src");
				string newsDescription = ExtractInnerText(Regex.Match(itemHtml, @"<p(.*?)</p>", RegexOptions.Singleline).Value);
				string newsCategory = ExtractInnerText(Regex.Match(itemHtml, @"<a class=""box-category-category(.*?)</a>", RegexOptions.Singleline).Value);

				// ToLower() on both sides is kept so the comparison stays inside the
				// database query exactly as before.
				var categoryInDb = await db.Categories.Where(category => category.Label.ToLower().Equals(newsCategory.ToLower())).FirstOrDefaultAsync();
				bool isAbsoluteCategoryLink = categoryInDb?.Link != null && categoryInDb.Link.Contains("https");
				string newsData = isAbsoluteCategoryLink
					? await CrawlDataFromUrlAsync(newsUrl, categoryInDb.Link)
					: await CrawlDataFromUrlAsync(newsUrl);

				if (newsData == "")
				{
					continue;
				}

				string newsTime = Regex.Match(newsData, @"publishdate(.*?)G", RegexOptions.Singleline).Value.Replace("publishdate\">", "").Replace("G", "").Trim();
				string newsContent = Regex.Match(newsData, @"<div class=""detail-content(.*?)RelatedOneNews(.*?)</div>", RegexOptions.Singleline).Value + "</div>";

				string checkCategoryLinkElem = Regex.Match(newsData, @"<div class=""detail-cate(.*?)</a>", RegexOptions.Singleline).Value;
				string checkCategoryLink = ExtractAttribute(checkCategoryLinkElem, "href");

				if (!knownCategoryLinks.Contains(checkCategoryLink))
				{
					continue;
				}

				// Invariant culture: with a culture-specific provider the '/' in the format
				// stands for that culture's date separator and parsing may fail. A date
				// that cannot be read skips this article instead of aborting the batch.
				if (!DateTime.TryParseExact(newsTime, PublishDateFormat, CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime createdAt))
				{
					_handleExceptionAsync.WriteError(new FormatException("Cannot parse publish date '" + newsTime + "' of article '" + newsUrl + "'."));
					continue;
				}

				newsList.Add(new Newspaper
				{
					Id = Guid.NewGuid(),
					Image = newsImgUrl,
					Title = newsTitle,
					Description = newsDescription,
					Categorylink = checkCategoryLink,
					CreatedAt = createdAt,
					Content = newsContent,
				});

				// Prevents the same title from being queued twice within one run.
				knownTitles.Add(newsTitle);
			}

			return newsList;
		}

		/// <summary>
		/// Returns the value of the first <c>name="value"</c> occurrence in
		/// <paramref name="html"/>, or an empty string when there is none.
		/// </summary>
		private static string ExtractAttribute(string html, string attributeName)
		{
			Match match = Regex.Match(html, Regex.Escape(attributeName) + @"=""(.*?)""", RegexOptions.Singleline);
			return match.Success ? match.Groups[1].Value : "";
		}

		/// <summary>
		/// Returns the text between the first <c>&gt;</c> and the next <c>&lt;</c> of
		/// <paramref name="elementHtml"/>, or an empty string when there is none.
		/// </summary>
		private static string ExtractInnerText(string elementHtml)
		{
			Match match = Regex.Match(elementHtml, @">(.*?)<", RegexOptions.Singleline);
			return match.Success ? match.Groups[1].Value : "";
		}
	}
}