某音数据分析
参加数据分析认证,如数据分析上岗证或数据分析师证书 #生活技巧# #工作学习技巧# #技能培训认证#
大家好,我是烤鸭:
某音竟然有 PC 版了。不过搜索返回的数据有限,会限制条数,亲测最多只能搜到 400 条数据。下面简单分析一下抓取过程。
工具使用
java + chromedriver + fiddler
使用 Java + Selenium 自动化操作网页。搜索需要登录,可以先手动登录一次,之后通过共享 cookie 复用登录状态:
@Test public void testXyin() { String keyWord = "旅游"; try { // 调用chrome driver System.setProperty("webdriver.chrome.driver", "D:\\dev\\env\\chromedriver\\chromedriver.exe"); // 共享cookie // ChromeOptions ChromeOptions chromeOptions = new ChromeOptions(); // 添加用户cookies chromeOptions.addArguments( "--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data-Cookie"); WebDriver driver = new ChromeDriver(chromeOptions); // 窗口最大化 driver.manage().window().maximize(); driver.get( "https://www.douyin.com/search/" + keyWord + "?publish_time=0&sort_type=0&source=normal_search&type=general"); // 调整高度 ((ChromeDriver) driver).executeScript("window.scrollTo(0, document.body.scrollHeight);"); Thread.sleep(1000); // 构建driver对象 driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS); WebElement webElement = driver.findElement(By.cssSelector("body")); webElement.click(); // 有的时候必须点击一下,下拉才能生效(有的网站是这样,原因未找到) } catch (Exception e) { e.printStackTrace(); } } 123456789101112131415161718192021222324252627282930
fiddler 脚本
在 Fiddler 中打开 Rules -> Customize Rules,改写 OnBeforeResponse 方法:
// Fiddler response hook. Keeps the stock "hide 304" rule and additionally appends the
// request and response bodies of Douyin general-search calls to a local log file.
static function OnBeforeResponse(oSession: Session) {
    if (m_Hide304s && oSession.responseCode == 304) {
        oSession["ui-hide"] = "true";
    }

    // Added at the end of the method: log every Douyin search API response.
    if (oSession.HostnameIs("www.douyin.com") &&
        oSession.uriContains("https://www.douyin.com/aweme/v1/web/general/search/single")) {
        // Remove chunking / HTTP compression first; otherwise the body written below
        // can be compressed bytes instead of readable JSON.
        oSession.utilDecodeResponse();

        var filename = "D:\\data\\dy\\fiddler-token.log";
        var curDate = new Date();
        // One "[timestamp] <request body>" line followed by one "<response body>" line.
        var logContent = "[" + curDate.toLocaleString() + "] " + oSession.GetRequestBodyAsString() + "\r\n" +
            oSession.GetResponseBodyAsString() + "\r\n";

        // File.AppendText creates the file when it does not exist yet, so no separate
        // Exists/CreateText branch is needed.
        var sw : System.IO.StreamWriter = System.IO.File.AppendText(filename);
        try {
            sw.Write(logContent);
        } finally {
            // Always release the file handle, even if the write fails.
            sw.Close();
        }
    }
}
解析数据
读取文件解析:
public void readText() { ReaderTxt rt = new ReaderTxt(); ArrayList<String> list = rt.InitTxt(); for (int i = 0; i < list.size(); i++) { String txt = list.get(i); if (!txt.startsWith("{")) { continue; } JSONObject jrs = JSONObject.parseObject(txt); JSONArray array = jrs.getJSONArray("data"); for (Object obs : array) { DyScrapVideo scrapVideo = new DyScrapVideo(); JSONObject json = (JSONObject) obs; // aweme_info JSONObject awemeInfo = json.getJSONObject("aweme_info"); if (!Optional.ofNullable(awemeInfo).isPresent()) { continue; } // https://www.douyin.com/video/ + aweme_id 详情页 String aweme_id = awemeInfo.getString("aweme_id"); String desc = awemeInfo.getString("desc"); Long publishTime = awemeInfo.getLong("create_time"); scrapVideo.setVideoDesc(desc); scrapVideo.setAwemeId(aweme_id); scrapVideo.setVideoPublishTime(UnixUtil.TimeStamp2Date(publishTime + "")); // author JSONObject author = awemeInfo.getJSONObject("author"); Long aLong = author.getLong("uid"); String nickname = author.getString("nickname"); String signature = author.getString("signature"); scrapVideo.setAuthorUid(aLong + ""); scrapVideo.setAuthorNickname(nickname); scrapVideo.setAuthorSignature(signature); JSONObject avatar_thumb = author.getJSONObject("avatar_thumb"); JSONArray url_list = avatar_thumb.getJSONArray("url_list"); if (Optional.ofNullable(url_list).isPresent()) { scrapVideo.setAuthorAvatarThumb(url_list.get(0).toString()); } Long follower_count = author.getLong("follower_count"); scrapVideo.setFollowerCount(follower_count != null ? 
follower_count.intValue() : 0); String custom_verify = author.getString("custom_verify"); scrapVideo.setCustomVerify(custom_verify); // video JSONObject video = awemeInfo.getJSONObject("video"); if(video != null){ JSONObject download_addr = video.getJSONObject("download_addr"); if(download_addr != null){ JSONArray down_url_list = download_addr.getJSONArray("url_list"); if (Optional.ofNullable(down_url_list).isPresent()) { scrapVideo.setVideoDownloadAddr(UnicodeUtil.unicodeToCN(down_url_list.get(0).toString())); } } Integer duration = video.getInteger("duration"); scrapVideo.setVideoDuration(duration); } // statistics JSONObject statistics = awemeInfo.getJSONObject("statistics"); if(statistics != null){ Integer comment_count = statistics.getInteger("comment_count"); Integer digg_count = statistics.getInteger("digg_count"); Integer download_count = statistics.getInteger("download_count"); Integer play_count = statistics.getInteger("play_count"); Integer share_count = statistics.getInteger("share_count"); Integer collect_count = statistics.getInteger("collect_count"); scrapVideo.setCommentCount(comment_count); scrapVideo.setDiggCount(digg_count); scrapVideo.setDownloadCount(download_count); scrapVideo.setPlayCount(play_count); scrapVideo.setShareCount(share_count); scrapVideo.setCollectCount(collect_count); } scrapVideo.setCreateDate(new Date()); scrapVideo.setSearchKeyword("北京旅游"); } } } public ArrayList<String> InitTxt() { ArrayList<String> list = new ArrayList<String>(); try { // 防止文件建立或读取失败,用catch捕捉错误并打印,也可以throw /* 读入TXT文件 */ String pathname = "D:\\data\\fiddler-token.log"; // 绝对路径或相对路径都可以,这里是绝对路径,写入文件时演示相对路径 File filename = new File(pathname); InputStreamReader reader = new InputStreamReader(new FileInputStream(filename), "utf-8"); // 建立一个输入流对象reader BufferedReader br = new BufferedReader(reader); // 建立一个对象,它把文件内容转成计算机能读懂的语言 String line = ""; while (line != null) { line = br.readLine(); // 一次读入一行数据 if (line == null) { break; } list.add(line); } } catch 
(Exception e) { e.printStackTrace(); } return list; } 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
实体对象:
package com.machu.picchu.crawler.dto; import java.util.Date; public class DyScrapVideo { private Integer id; private String awemeId; private String videoDesc; private Date videoPublishTime; private String videoDownloadAddr; private Integer videoDuration; private Integer commentCount; private Integer diggCount; private Integer playCount; private Integer downloadCount; private Integer shareCount; private Integer collectCount; private String authorUid; private String authorNickname; private String authorSignature; private String authorAvatarThumb; private Integer followerCount; private String customVerify; private Date createDate; private Date publishDate; private String searchKeyword; private String memo; private Integer status; public Integer getId() { return id; } public void setId(Integer id) { this.id = id; } public String getVideoDesc() { return videoDesc; } public void setVideoDesc(String videoDesc) { this.videoDesc = videoDesc; } public Date getVideoPublishTime() { return videoPublishTime; } public void setVideoPublishTime(Date videoPublishTime) { this.videoPublishTime = videoPublishTime; } public String getVideoDownloadAddr() { return videoDownloadAddr; } public void setVideoDownloadAddr(String videoDownloadAddr) { this.videoDownloadAddr = videoDownloadAddr; } public Integer getVideoDuration() { return videoDuration; } public void setVideoDuration(Integer videoDuration) { this.videoDuration = videoDuration; } public Integer getCommentCount() { return commentCount; } public void setCommentCount(Integer commentCount) { this.commentCount = commentCount; } public Integer getDiggCount() { return diggCount; } public void setDiggCount(Integer diggCount) { this.diggCount = diggCount; } public Integer getPlayCount() { return playCount; } public void setPlayCount(Integer playCount) { this.playCount = playCount; } public Integer getDownloadCount() { return downloadCount; } public void setDownloadCount(Integer downloadCount) { this.downloadCount = downloadCount; } public 
Integer getShareCount() { return shareCount; } public void setShareCount(Integer shareCount) { this.shareCount = shareCount; } public Integer getCollectCount() { return collectCount; } public void setCollectCount(Integer collectCount) { this.collectCount = collectCount; } public String getAuthorUid() { return authorUid; } public void setAuthorUid(String authorUid) { this.authorUid = authorUid; } public String getAuthorNickname() { return authorNickname; } public void setAuthorNickname(String authorNickname) { this.authorNickname = authorNickname; } public String getAuthorSignature() { return authorSignature; } public void setAuthorSignature(String authorSignature) { this.authorSignature = authorSignature; } public String getAuthorAvatarThumb() { return authorAvatarThumb; } public void setAuthorAvatarThumb(String authorAvatarThumb) { this.authorAvatarThumb = authorAvatarThumb; } public Integer getFollowerCount() { return followerCount; } public void setFollowerCount(Integer followerCount) { this.followerCount = followerCount; } public String getCustomVerify() { return customVerify; } public void setCustomVerify(String customVerify) { this.customVerify = customVerify; } public Date getCreateDate() { return createDate; } public void setCreateDate(Date createDate) { this.createDate = createDate; } public Date getPublishDate() { return publishDate; } public void setPublishDate(Date publishDate) { this.publishDate = publishDate; } public String getSearchKeyword() { return searchKeyword; } public void setSearchKeyword(String searchKeyword) { this.searchKeyword = searchKeyword; } public String getMemo() { return memo; } public void setMemo(String memo) { this.memo = memo; } public Integer getStatus() { return status; } public void setStatus(Integer status) { this.status = status; } public String getAwemeId() { return awemeId; } public void setAwemeId(String awemeId) { this.awemeId = awemeId; } } 
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
网址:某音数据分析 https://www.yuejiaxmz.com/news/view/202835
相关内容
生存数据分析人人都是数据分析师:到底什么是数据分析?如何进行数据分析?
python数据分析
全面的数据,全面数据分析与报告
2 数据分析EDA
工作报告数据分析.docx
9种最常用数据分析方法!
个人消费情况怎么分析数据
生活数据化案例分析报告怎么写
头条生活领域细分怎么看数据分析